function getSelectedCategory() {
const allCheckbox = document.getElementById('categoryAll');
- if (allCheckbox.checked) {
- return "All";
- }
- return document.querySelector('input[name="categoryGroup"]:checked');
+ return document.querySelectorAll('input[name="categoryGroup"]:checked');
}
function triggerSearch(query) {
@@ -108,13 +120,13 @@
if (query) {
trucatedQuery = truncateSentence(query, 20);
- if (getSelectedCategory() == null) {
+ if (getSelectedCategory().length === 0) {
searchResultsHeader.textContent = 'Select the result type for your search.';
document.getElementById('searchPageResultsContainer').innerHTML = '';
return;
}
- const selectedCategory = getSelectedCategory().value;
- const searchType = selectedCategory == ("Documentation") ? "docs" : selectedCategory == ("News") ? "proj" : "all";
+ const selectedCategory = getSelectedCategory();
+ const searchType = Array.from(selectedCategory).map(element => element.value).join(',');
const urlPath = window.location.pathname;
const versionMatch = urlPath.match(/(\d+\.\d+)/);
const docsVersion = versionMatch ? versionMatch[1] : "latest";
@@ -139,11 +151,12 @@
document.addEventListener('DOMContentLoaded', function() {
const categoryAll = document.getElementById('categoryAll');
const categoryDocumentation = document.getElementById('categoryDocumentation');
- const categoryNews = document.getElementById('categoryNews');
+ const categoryBlog = document.getElementById('categoryBlog');
+ const categoryEvent = document.getElementById('categoryEvent');
const searchInput = document.getElementById('searchPageInput');
function updateAllCheckbox() {
- if (categoryDocumentation.checked && categoryNews.checked) {
+ if (categoryDocumentation.checked && categoryBlog.checked && categoryEvent.checked) {
categoryAll.checked = true;
} else {
categoryAll.checked = false;
@@ -153,10 +166,12 @@
function updateChildCheckboxes() {
if (categoryAll.checked) {
categoryDocumentation.checked = true;
- categoryNews.checked = true;
+ categoryBlog.checked = true;
+ categoryEvent.checked = true;
} else {
categoryDocumentation.checked = false;
- categoryNews.checked = false;
+ categoryBlog.checked = false;
+ categoryEvent.checked = false;
}
}
@@ -168,7 +183,11 @@
updateAllCheckbox();
triggerSearch(searchInput.value.trim());
});
- categoryNews.addEventListener('change', () => {
+ categoryBlog.addEventListener('change', () => {
+ updateAllCheckbox();
+ triggerSearch(searchInput.value.trim());
+ });
+ categoryEvent.addEventListener('change', () => {
updateAllCheckbox();
triggerSearch(searchInput.value.trim());
});
diff --git a/_ml-commons-plugin/api/async-batch-ingest.md b/_ml-commons-plugin/api/async-batch-ingest.md
new file mode 100644
index 0000000000..ace95ba4d4
--- /dev/null
+++ b/_ml-commons-plugin/api/async-batch-ingest.md
@@ -0,0 +1,97 @@
+---
+layout: default
+title: Asynchronous batch ingestion
+parent: ML Commons APIs
+has_children: false
+has_toc: false
+nav_order: 35
+---
+
+# Asynchronous batch ingestion
+**Introduced 2.17**
+{: .label .label-purple }
+
+Use the Asynchronous Batch Ingestion API to ingest data into your OpenSearch cluster from your files on remote file servers, such as Amazon Simple Storage Service (Amazon S3) or OpenAI. For detailed configuration steps, see [Asynchronous batch ingestion]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/async-batch-ingestion/).
+
+## Path and HTTP methods
+
+```json
+POST /_plugins/_ml/_batch_ingestion
+```
+
+#### Request fields
+
+The following table lists the available request fields.
+
+Field | Data type | Required/Optional | Description
+:--- | :--- | :---
+`index_name`| String | Required | The index name.
+`field_map` | Object | Required | Maps fields from the source file to specific fields in an OpenSearch index for ingestion.
+`ingest_fields` | Array | Optional | Lists fields from the source file that should be ingested directly into the OpenSearch index without any additional mapping.
+`credential` | Object | Required | Contains the authentication information for accessing external data sources, such as Amazon S3 or OpenAI.
+`data_source` | Object | Required | Specifies the type and location of the external file(s) from which the data is ingested.
+`data_source.type` | String | Required | Specifies the type of the external data source. Valid values are `s3` and `openAI`.
+`data_source.source` | Array | Required | Specifies one or more file locations from which the data is ingested. For `s3`, specify the file path to the Amazon S3 bucket (for example, `["s3://offlinebatch/output/sagemaker_batch.json.out"]`). For `openAI`, specify the file IDs for input or output files (for example, `["file-", "file-", "file-"]`).
+
+## Example request: Ingesting a single file
+
+```json
+POST /_plugins/_ml/_batch_ingestion
+{
+ "index_name": "my-nlp-index",
+ "field_map": {
+ "chapter": "$.content[0]",
+ "title": "$.content[1]",
+ "chapter_embedding": "$.SageMakerOutput[0]",
+ "title_embedding": "$.SageMakerOutput[1]",
+ "_id": "$.id"
+ },
+ "ingest_fields": ["$.id"],
+ "credential": {
+ "region": "us-east-1",
+ "access_key": "",
+ "secret_key": "",
+ "session_token": ""
+ },
+ "data_source": {
+ "type": "s3",
+ "source": ["s3://offlinebatch/output/sagemaker_batch.json.out"]
+ }
+}
+```
+{% include copy-curl.html %}
+
+## Example request: Ingesting multiple files
+
+```json
+POST /_plugins/_ml/_batch_ingestion
+{
+ "index_name": "my-nlp-index-openai",
+ "field_map": {
+ "question": "source[1].$.body.input[0]",
+ "answer": "source[1].$.body.input[1]",
+ "question_embedding":"source[0].$.response.body.data[0].embedding",
+ "answer_embedding":"source[0].$.response.body.data[1].embedding",
+ "_id": ["source[0].$.custom_id", "source[1].$.custom_id"]
+ },
+ "ingest_fields": ["source[2].$.custom_field1", "source[2].$.custom_field2"],
+ "credential": {
+ "openAI_key": ""
+ },
+ "data_source": {
+ "type": "openAI",
+ "source": ["file-", "file-", "file-"]
+ }
+}
+```
+{% include copy-curl.html %}
+
+## Example response
+
+```json
+{
+ "task_id": "cbsPlpEBMHcagzGbOQOx",
+ "task_type": "BATCH_INGEST",
+ "status": "CREATED"
+}
+```
diff --git a/_ml-commons-plugin/api/connector-apis/update-connector.md b/_ml-commons-plugin/api/connector-apis/update-connector.md
index 64790bb57f..625d58bb62 100644
--- a/_ml-commons-plugin/api/connector-apis/update-connector.md
+++ b/_ml-commons-plugin/api/connector-apis/update-connector.md
@@ -29,17 +29,20 @@ PUT /_plugins/_ml/connectors/
The following table lists the updatable fields. For more information about all connector fields, see [Blueprint configuration parameters]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints#configuration-parameters).
-| Field | Data type | Description |
-| :--- | :--- | :--- |
-| `name` | String | The name of the connector. |
-| `description` | String | A description of the connector. |
-| `version` | Integer | The version of the connector. |
-| `protocol` | String | The protocol for the connection. For AWS services, such as Amazon SageMaker and Amazon Bedrock, use `aws_sigv4`. For all other services, use `http`. |
-| `parameters` | JSON object | The default connector parameters, including `endpoint` and `model`. Any parameters included in this field can be overridden by parameters specified in a predict request. |
+| Field | Data type | Description |
+| :--- |:------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `name` | String | The name of the connector. |
+| `description` | String | A description of the connector. |
+| `version` | Integer | The connector version. |
+| `protocol` | String | The protocol for the connection. For AWS services, such as Amazon SageMaker and Amazon Bedrock, use `aws_sigv4`. For all other services, use `http`. |
+| `parameters` | JSON object | The default connector parameters, including `endpoint` and `model`. Any parameters included in this field can be overridden by parameters specified in a predict request. |
| `credential` | JSON object | Defines any credential variables required in order to connect to your chosen endpoint. ML Commons uses **AES/GCM/NoPadding** symmetric encryption to encrypt your credentials. When the connection to the cluster first starts, OpenSearch creates a random 32-byte encryption key that persists in OpenSearch's system index. Therefore, you do not need to manually set the encryption key. |
-| `actions` | JSON array | Defines which actions can run within the connector. If you're an administrator creating a connection, add the [blueprint]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/) for your desired connection. |
-| `backend_roles` | JSON array | A list of OpenSearch backend roles. For more information about setting up backend roles, see [Assigning backend roles to users]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#assigning-backend-roles-to-users). |
-| `access_mode` | String | Sets the access mode for the model, either `public`, `restricted`, or `private`. Default is `private`. For more information about `access_mode`, see [Model groups]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#model-groups). |
+| `actions` | JSON array | Defines which actions can run within the connector. If you're an administrator creating a connection, add the [blueprint]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/) for your desired connection. |
+| `backend_roles` | JSON array | A list of OpenSearch backend roles. For more information about setting up backend roles, see [Assigning backend roles to users]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#assigning-backend-roles-to-users). |
+| `access_mode` | String | Sets the access mode for the model, either `public`, `restricted`, or `private`. Default is `private`. For more information about `access_mode`, see [Model groups]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#model-groups). |
+| `parameters.skip_validating_missing_parameters` | Boolean | When set to `true`, this option allows you to send a request using a connector without validating any missing parameters. Default is `false`. |
+
+
#### Example request
diff --git a/_ml-commons-plugin/api/execute-algorithm.md b/_ml-commons-plugin/api/execute-algorithm.md
index 7b06cfefe8..6acd926444 100644
--- a/_ml-commons-plugin/api/execute-algorithm.md
+++ b/_ml-commons-plugin/api/execute-algorithm.md
@@ -2,7 +2,7 @@
layout: default
title: Execute algorithm
parent: ML Commons APIs
-nav_order: 30
+nav_order: 37
---
# Execute algorithm
diff --git a/_ml-commons-plugin/api/model-apis/batch-predict.md b/_ml-commons-plugin/api/model-apis/batch-predict.md
index b32fbb108d..c1dc7348fe 100644
--- a/_ml-commons-plugin/api/model-apis/batch-predict.md
+++ b/_ml-commons-plugin/api/model-apis/batch-predict.md
@@ -31,7 +31,13 @@ POST /_plugins/_ml/models//_batch_predict
## Prerequisites
-Before using the Batch Predict API, you need to create a connector to the externally hosted model. For example, to create a connector to an OpenAI `text-embedding-ada-002` model, send the following request:
+Before using the Batch Predict API, you need to create a connector to the externally hosted model. For each action, specify the `action_type` parameter that describes the action:
+
+- `batch_predict`: Runs the batch predict operation.
+- `batch_predict_status`: Checks the batch predict operation status.
+- `cancel_batch_predict`: Cancels the batch predict operation.
+
+For example, to create a connector to an OpenAI `text-embedding-ada-002` model, send the following request. The `cancel_batch_predict` action is optional and supports canceling the batch job running on OpenAI:
```json
POST /_plugins/_ml/connectors/_create
@@ -68,6 +74,22 @@ POST /_plugins/_ml/connectors/_create
"Authorization": "Bearer ${credential.openAI_key}"
},
"request_body": "{ \"input_file_id\": \"${parameters.input_file_id}\", \"endpoint\": \"${parameters.endpoint}\", \"completion_window\": \"24h\" }"
+ },
+ {
+ "action_type": "batch_predict_status",
+ "method": "GET",
+ "url": "https://api.openai.com/v1/batches/${parameters.id}",
+ "headers": {
+ "Authorization": "Bearer ${credential.openAI_key}"
+ }
+ },
+ {
+ "action_type": "cancel_batch_predict",
+ "method": "POST",
+ "url": "https://api.openai.com/v1/batches/${parameters.id}/cancel",
+ "headers": {
+ "Authorization": "Bearer ${credential.openAI_key}"
+ }
}
]
}
@@ -123,45 +145,87 @@ POST /_plugins/_ml/models/lyjxwZABNrAVdFa9zrcZ/_batch_predict
#### Example response
+The response contains the task ID for the batch predict operation:
+
```json
{
- "inference_results": [
- {
- "output": [
- {
- "name": "response",
- "dataAsMap": {
- "id": "batch_",
- "object": "batch",
- "endpoint": "/v1/embeddings",
- "errors": null,
- "input_file_id": "file-",
- "completion_window": "24h",
- "status": "validating",
- "output_file_id": null,
- "error_file_id": null,
- "created_at": 1722037257,
- "in_progress_at": null,
- "expires_at": 1722123657,
- "finalizing_at": null,
- "completed_at": null,
- "failed_at": null,
- "expired_at": null,
- "cancelling_at": null,
- "cancelled_at": null,
- "request_counts": {
- "total": 0,
- "completed": 0,
- "failed": 0
- },
- "metadata": null
- }
- }
- ],
- "status_code": 200
- }
- ]
+ "task_id": "KYZSv5EBqL2d0mFvs80C",
+ "status": "CREATED"
}
```
-For the definition of each field in the result, see [OpenAI Batch API](https://platform.openai.com/docs/guides/batch). Once the batch inference is complete, you can download the output by calling the [OpenAI Files API](https://platform.openai.com/docs/api-reference/files) and providing the file name specified in the `id` field of the response.
\ No newline at end of file
+To check the status of the batch predict job, provide the task ID to the [Tasks API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/get-task/). You can find the job details in the `remote_job` field in the task. Once the prediction is complete, the task `state` changes to `COMPLETED`.
+
+#### Example request
+
+```json
+GET /_plugins/_ml/tasks/KYZSv5EBqL2d0mFvs80C
+```
+{% include copy-curl.html %}
+
+#### Example response
+
+The response contains the batch predict operation details in the `remote_job` field:
+
+```json
+{
+ "model_id": "JYZRv5EBqL2d0mFvKs1E",
+ "task_type": "BATCH_PREDICTION",
+ "function_name": "REMOTE",
+ "state": "RUNNING",
+ "input_type": "REMOTE",
+ "worker_node": [
+ "Ee5OCIq0RAy05hqQsNI1rg"
+ ],
+ "create_time": 1725491751455,
+ "last_update_time": 1725491751455,
+ "is_async": false,
+ "remote_job": {
+ "cancelled_at": null,
+ "metadata": null,
+ "request_counts": {
+ "total": 3,
+ "completed": 3,
+ "failed": 0
+ },
+ "input_file_id": "file-XXXXXXXXXXXX",
+ "output_file_id": "file-XXXXXXXXXXXXX",
+ "error_file_id": null,
+ "created_at": 1725491753,
+ "in_progress_at": 1725491753,
+ "expired_at": null,
+ "finalizing_at": 1725491757,
+ "completed_at": null,
+ "endpoint": "/v1/embeddings",
+ "expires_at": 1725578153,
+ "cancelling_at": null,
+ "completion_window": "24h",
+ "id": "batch_XXXXXXXXXXXXXXX",
+ "failed_at": null,
+ "errors": null,
+ "object": "batch",
+ "status": "in_progress"
+ }
+}
+```
+
+For the definition of each field in the result, see [OpenAI Batch API](https://platform.openai.com/docs/guides/batch). Once the batch inference is complete, you can download the output by calling the [OpenAI Files API](https://platform.openai.com/docs/api-reference/files) and providing the file name specified in the `id` field of the response.
+
+### Canceling a batch predict job
+
+You can also cancel the batch predict operation running on the remote platform using the task ID returned by the batch predict request. To add this capability, set the `action_type` to `cancel_batch_predict` in the connector configuration when creating the connector.
+
+#### Example request
+
+```json
+POST /_plugins/_ml/tasks/KYZSv5EBqL2d0mFvs80C/_cancel_batch
+```
+{% include copy-curl.html %}
+
+#### Example response
+
+```json
+{
+ "status": "OK"
+}
+```
diff --git a/_ml-commons-plugin/remote-models/async-batch-ingestion.md b/_ml-commons-plugin/remote-models/async-batch-ingestion.md
new file mode 100644
index 0000000000..a09c028477
--- /dev/null
+++ b/_ml-commons-plugin/remote-models/async-batch-ingestion.md
@@ -0,0 +1,190 @@
+---
+layout: default
+title: Asynchronous batch ingestion
+nav_order: 90
+parent: Connecting to externally hosted models
+grand_parent: Integrating ML models
+---
+
+
+# Asynchronous batch ingestion
+**Introduced 2.17**
+{: .label .label-purple }
+
+[Batch ingestion]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/batch-ingestion/) configures an ingest pipeline, which processes documents one by one. For each document, batch ingestion calls an externally hosted model to generate text embeddings from the document text and then ingests the document, including text and embeddings, into an OpenSearch index.
+
+An alternative to this real-time process, _asynchronous_ batch ingestion, ingests both documents and their embeddings generated outside of OpenSearch and stored on a remote file server, such as Amazon Simple Storage Service (Amazon S3) or OpenAI. Asynchronous ingestion returns a task ID and runs asynchronously to ingest data offline into your k-NN cluster for neural search. You can use asynchronous batch ingestion together with the [Batch Predict API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/batch-predict/) to perform inference asynchronously. The batch predict operation takes an input file containing documents and calls an externally hosted model to generate embeddings for those documents in an output file. You can then use asynchronous batch ingestion to ingest both the input file containing documents and the output file containing their embeddings into an OpenSearch index.
+
+As of OpenSearch 2.17, the Asynchronous Batch Ingestion API is supported by Amazon SageMaker, Amazon Bedrock, and OpenAI.
+{: .note}
+
+## Prerequisites
+
+Before using asynchronous batch ingestion, you must generate text embeddings using a model of your choice and store the output on a file server, such as Amazon S3. For example, you can store the output of a Batch API call to an Amazon SageMaker text embedding model in a file with the Amazon S3 output path `s3://offlinebatch/output/sagemaker_batch.json.out`. The output is in JSONL format, with each line representing a text embedding result. The file contents have the following format:
+
+```
+{"SageMakerOutput":[[-0.017166402,0.055771016,...],[-0.06422759,-0.004301484,...]],"content":["this is chapter 1","harry potter"],"id":1}
+{"SageMakerOutput":[[-0.017455402,0.023771016,...],[-0.02322759,-0.009101284,...]],"content":["this is chapter 2","draco malfoy"],"id":1}
+...
+```
+
+## Ingesting data from a single file
+
+First, create a k-NN index into which you'll ingest the data. The fields in the k-NN index represent the structure of the data in the source file.
+
+In this example, the source file holds documents containing titles and chapters, along with their corresponding embeddings. Thus, you'll create a k-NN index with the fields `id`, `chapter_embedding`, `chapter`, `title_embedding`, and `title`:
+
+```json
+PUT /my-nlp-index
+{
+ "settings": {
+ "index.knn": true
+ },
+ "mappings": {
+ "properties": {
+ "id": {
+ "type": "text"
+ },
+ "chapter_embedding": {
+ "type": "knn_vector",
+ "dimension": 384,
+ "method": {
+ "engine": "nmslib",
+ "space_type": "cosinesimil",
+ "name": "hnsw",
+ "parameters": {
+ "ef_construction": 512,
+ "m": 16
+ }
+ }
+ },
+ "chapter": {
+ "type": "text"
+ },
+ "title_embedding": {
+ "type": "knn_vector",
+ "dimension": 384,
+ "method": {
+ "engine": "nmslib",
+ "space_type": "cosinesimil",
+ "name": "hnsw",
+ "parameters": {
+ "ef_construction": 512,
+ "m": 16
+ }
+ }
+ },
+ "title": {
+ "type": "text"
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+When using an S3 file as the source for asynchronous batch ingestion, you must map the fields in the source file to fields in the index in order to indicate into which index field each piece of data is ingested. If no JSON path is provided for a field, that field will be set to `null` in the k-NN index.
+
+In the `field_map`, indicate the location of the data for each field in the source file. You can also specify fields to be ingested directly into your index without making any changes to the source file by adding their JSON paths to the `ingest_fields` array. For example, in the following asynchronous batch ingestion request, the element with the JSON path `$.id` from the source file is ingested directly into the `id` field of your index. To ingest this data from the Amazon S3 file, send the following request to your OpenSearch endpoint:
+
+```json
+POST /_plugins/_ml/_batch_ingestion
+{
+ "index_name": "my-nlp-index",
+ "field_map": {
+ "chapter": "$.content[0]",
+ "title": "$.content[1]",
+ "chapter_embedding": "$.SageMakerOutput[0]",
+ "title_embedding": "$.SageMakerOutput[1]",
+ "_id": "$.id"
+ },
+ "ingest_fields": ["$.id"],
+ "credential": {
+ "region": "us-east-1",
+ "access_key": "",
+ "secret_key": "",
+ "session_token": ""
+ },
+ "data_source": {
+ "type": "s3",
+ "source": ["s3://offlinebatch/output/sagemaker_batch.json.out"]
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response contains a task ID for the ingestion task:
+
+```json
+{
+ "task_id": "cbsPlpEBMHcagzGbOQOx",
+ "task_type": "BATCH_INGEST",
+ "status": "CREATED"
+}
+```
+
+To check the status of the operation, provide the task ID to the [Tasks API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/get-task/). Once ingestion is complete, the task `state` changes to `COMPLETED`.
+
+
+## Ingesting data from multiple files
+
+You can also ingest data from multiple files by specifying the file locations in the `source`. The following example ingests data from three OpenAI files.
+
+The OpenAI Batch API input file is formatted as follows:
+
+```
+{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "text-embedding-ada-002", "input": [ "What is the meaning of life?", "The food was delicious and the waiter..."]}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "text-embedding-ada-002", "input": [ "What is the meaning of work?", "The travel was fantastic and the view..."]}}
+{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "text-embedding-ada-002", "input": [ "What is the meaning of friend?", "The old friend was far away and the time..."]}}
+...
+```
+
+The OpenAI Batch API output file is formatted as follows:
+
+```
+{"id": "batch_req_ITKQn29igorXCAGp6wzYs5IS", "custom_id": "request-1", "response": {"status_code": 200, "request_id": "10845755592510080d13054c3776aef4", "body": {"object": "list", "data": [{"object": "embedding", "index": 0, "embedding": [0.0044326545, ... ...]}, {"object": "embedding", "index": 1, "embedding": [0.002297497, ... ... ]}], "model": "text-embedding-ada-002", "usage": {"prompt_tokens": 15, "total_tokens": 15}}}, "error": null}
+...
+```
+
+If you have run the Batch API in OpenAI for text embedding and want to ingest the model input and output files along with some metadata into your index, send the following asynchronous ingestion request. Make sure to use `source[file-index]` to identify the file's location in the source array in the request body. For example, `source[0]` refers to the first file in the `data_source.source` array.
+
+The following request ingests seven fields into your index: Five are specified in the `field_map` section and two are specified in `ingest_fields`. The format follows the pattern `source[file-index].jsonPath`, indicating the JSON path for each file. In the `field_map`, `source[1].$.body.input[0]` is used as the JSON path to ingest data into the `question` field from the second file in the `source` array. The `ingest_fields` array lists all elements from the `source` files that will be ingested directly into your index:
+
+```json
+POST /_plugins/_ml/_batch_ingestion
+{
+ "index_name": "my-nlp-index-openai",
+ "field_map": {
+ "question": "source[1].$.body.input[0]",
+ "answer": "source[1].$.body.input[1]",
+ "question_embedding":"source[0].$.response.body.data[0].embedding",
+ "answer_embedding":"source[0].$.response.body.data[1].embedding",
+ "_id": ["source[0].$.custom_id", "source[1].$.custom_id"]
+ },
+ "ingest_fields": ["source[2].$.custom_field1", "source[2].$.custom_field2"],
+ "credential": {
+ "openAI_key": ""
+ },
+ "data_source": {
+ "type": "openAI",
+ "source": ["file-", "file-", "file-"]
+ }
+}
+```
+{% include copy-curl.html %}
+
+In the request, make sure to define the `_id` field in the `field_map`. This is necessary in order to map each data entry from the three separate files.
+
+The response contains a task ID for the ingestion task:
+
+```json
+{
+ "task_id": "cbsPlpEBMHcagzGbOQOx",
+ "task_type": "BATCH_INGEST",
+ "status": "CREATED"
+}
+```
+
+To check the status of the operation, provide the task ID to the [Tasks API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/get-task/). Once ingestion is complete, the task `state` changes to `COMPLETED`.
+
+For request field descriptions, see [Asynchronous Batch Ingestion API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/async-batch-ingest/).
\ No newline at end of file
diff --git a/_ml-commons-plugin/remote-models/blueprints.md b/_ml-commons-plugin/remote-models/blueprints.md
index 254a21b068..9b95c31166 100644
--- a/_ml-commons-plugin/remote-models/blueprints.md
+++ b/_ml-commons-plugin/remote-models/blueprints.md
@@ -55,19 +55,20 @@ As an ML developer, you can build connector blueprints for other platforms. Usin
## Configuration parameters
-| Field | Data type | Is required | Description |
-|:---|:---|:---|:---|
-| `name` | String | Yes | The name of the connector. |
-| `description` | String | Yes | A description of the connector. |
-| `version` | Integer | Yes | The version of the connector. |
-| `protocol` | String | Yes | The protocol for the connection. For AWS services such as Amazon SageMaker and Amazon Bedrock, use `aws_sigv4`. For all other services, use `http`. |
-| `parameters` | JSON object | Yes | The default connector parameters, including `endpoint` and `model`. Any parameters indicated in this field can be overridden by parameters specified in a predict request. |
-| `credential` | JSON object | Yes | Defines any credential variables required to connect to your chosen endpoint. ML Commons uses **AES/GCM/NoPadding** symmetric encryption to encrypt your credentials. When the connection to the cluster first starts, OpenSearch creates a random 32-byte encryption key that persists in OpenSearch's system index. Therefore, you do not need to manually set the encryption key. |
-| `actions` | JSON array | Yes | Defines what actions can run within the connector. If you're an administrator creating a connection, add the [blueprint]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/) for your desired connection. |
-| `backend_roles` | JSON array | Yes | A list of OpenSearch backend roles. For more information about setting up backend roles, see [Assigning backend roles to users]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#assigning-backend-roles-to-users). |
-| `access_mode` | String | Yes | Sets the access mode for the model, either `public`, `restricted`, or `private`. Default is `private`. For more information about `access_mode`, see [Model groups]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#model-groups). |
-| `add_all_backend_roles` | Boolean | Yes | When set to `true`, adds all `backend_roles` to the access list, which only a user with admin permissions can adjust. When set to `false`, non-admins can add `backend_roles`. |
-| `client_config` | JSON object | No | The client configuration object, which provides settings that control the behavior of the client connections used by the connector. These settings allow you to manage connection limits and timeouts, ensuring efficient and reliable communication. |
+| Field | Data type | Is required | Description |
+|:-------------------------------------------------|:---|:------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `name` | String | Yes | The name of the connector. |
+| `description` | String | Yes | A description of the connector. |
+| `version` | Integer | Yes | The connector version. |
+| `protocol` | String | Yes | The protocol for the connection. For AWS services, such as Amazon SageMaker and Amazon Bedrock, use `aws_sigv4`. For all other services, use `http`. |
+| `parameters` | JSON object | Yes | The default connector parameters, including `endpoint`, `model`, and `skip_validating_missing_parameters`. Any parameters indicated in this field can be overridden by parameters specified in a predict request. |
+| `credential` | JSON object | Yes | Defines any credential variables required for connecting to your chosen endpoint. ML Commons uses **AES/GCM/NoPadding** symmetric encryption to encrypt your credentials. When the cluster connection is initiated, OpenSearch creates a random 32-byte encryption key that persists in OpenSearch's system index. Therefore, you do not need to manually set the encryption key. |
+| `actions` | JSON array | Yes | Defines the actions that can run within the connector. If you're an administrator creating a connection, add the [blueprint]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/) for your desired connection. |
+| `backend_roles` | JSON array | Yes | A list of OpenSearch backend roles. For more information about setting up backend roles, see [Assigning backend roles to users]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#assigning-backend-roles-to-users). |
+| `access_mode` | String | Yes | Sets the access mode for the model, either `public`, `restricted`, or `private`. Default is `private`. For more information about `access_mode`, see [Model groups]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#model-groups). |
+| `add_all_backend_roles` | Boolean | Yes | When set to `true`, adds all `backend_roles` to the access list, which only a user with admin permissions can adjust. When set to `false`, non-admins can add `backend_roles`. |
+| `client_config` | JSON object | No | The client configuration object, which provides settings that control the behavior of the client connections used by the connector. These settings allow you to manage connection limits and timeouts, ensuring efficient and reliable communication. |
+| `parameters.skip_validating_missing_parameters` | Boolean | No | When set to `true`, this option allows you to send a request using a connector without validating any missing parameters. Default is `false`. |
The `actions` parameter supports the following options.
@@ -76,12 +77,11 @@ The `actions` parameter supports the following options.
|:---|:---|:---|
| `action_type` | String | Required. Sets the ML Commons API operation to use upon connection. As of OpenSearch 2.9, only `predict` is supported. |
| `method` | String | Required. Defines the HTTP method for the API call. Supports `POST` and `GET`. |
-| `url` | String | Required. Sets the connection endpoint at which the action occurs. This must match the regex expression for the connection used when [adding trusted endpoints]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index#adding-trusted-endpoints). |
-| `headers` | JSON object | Sets the headers used inside the request or response body. Default is `ContentType: application/json`. If your third-party ML tool requires access control, define the required `credential` parameters in the `headers` parameter. |
+| `url` | String | Required. Specifies the connection endpoint at which the action occurs. This must match the regex expression for the connection used when [adding trusted endpoints]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index#adding-trusted-endpoints). |
| `request_body` | String | Required. Sets the parameters contained in the request body of the action. The parameters must include `\"inputText\"`, which specifies how users of the connector should construct the request payload for the `action_type`. |
| `pre_process_function` | String | Optional. A built-in or custom Painless script used to preprocess the input data. OpenSearch provides the following built-in preprocess functions that you can call directly:
- `connector.pre_process.cohere.embedding` for [Cohere](https://cohere.com/) embedding models
- `connector.pre_process.openai.embedding` for [OpenAI](https://platform.openai.com/docs/guides/embeddings) embedding models
- `connector.pre_process.default.embedding`, which you can use to preprocess documents in neural search requests so that they are in the format that ML Commons can process with the default preprocessor (OpenSearch 2.11 or later). For more information, see [Built-in functions](#built-in-pre--and-post-processing-functions). |
| `post_process_function` | String | Optional. A built-in or custom Painless script used to post-process the model output data. OpenSearch provides the following built-in post-process functions that you can call directly:
- `connector.post_process.cohere.embedding` for [Cohere text embedding models](https://docs.cohere.com/reference/embed)
- `connector.post_process.openai.embedding` for [OpenAI text embedding models](https://platform.openai.com/docs/api-reference/embeddings)
- `connector.post_process.default.embedding`, which you can use to post-process documents in the model response so that they are in the format that neural search expects (OpenSearch 2.11 or later). For more information, see [Built-in functions](#built-in-pre--and-post-processing-functions). |
-
+| `headers` | JSON object | Specifies the headers used in the request or response body. Default is `ContentType: application/json`. If your third-party ML tool requires access control, define the required `credential` parameters in the `headers` parameter. |
The `client_config` parameter supports the following options.
diff --git a/_ml-commons-plugin/tutorials/semantic-search-byte-vectors.md b/_ml-commons-plugin/tutorials/semantic-search-byte-vectors.md
index 7061d3cb5a..c4cc27f660 100644
--- a/_ml-commons-plugin/tutorials/semantic-search-byte-vectors.md
+++ b/_ml-commons-plugin/tutorials/semantic-search-byte-vectors.md
@@ -7,7 +7,7 @@ nav_order: 10
# Semantic search using byte-quantized vectors
-This tutorial illustrates how to build a semantic search using the [Cohere Embed model](https://docs.cohere.com/reference/embed) and byte-quantized vectors. For more information about using byte-quantized vectors, see [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#lucene-byte-vector).
+This tutorial shows you how to build a semantic search using the [Cohere Embed model](https://docs.cohere.com/reference/embed) and byte-quantized vectors. For more information about using byte-quantized vectors, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#byte-vectors).
The Cohere Embed v3 model supports several `embedding_types`. For this tutorial, you'll use the `INT8` type to encode byte-quantized vectors.
diff --git a/_monitoring-your-cluster/pa/index.md b/_monitoring-your-cluster/pa/index.md
index bb4f9c6c30..156e985e8b 100644
--- a/_monitoring-your-cluster/pa/index.md
+++ b/_monitoring-your-cluster/pa/index.md
@@ -60,7 +60,7 @@ private-key-file-path = specify_path
The Performance Analyzer plugin is included in the installations for [Docker]({{site.url}}{{site.baseurl}}/opensearch/install/docker/) and [tarball]({{site.url}}{{site.baseurl}}/opensearch/install/tar/), but you can also install the plugin manually.
-To install the Performance Analyzer plugin manually, download the plugin from [Maven](https://search.maven.org/search?q=org.opensearch.plugin) and install it using the standard [plugin installation]({{site.url}}{{site.baseurl}}/opensearch/install/plugins/) process. Performance Analyzer runs on each node in a cluster.
+To install the Performance Analyzer plugin manually, download the plugin from [Maven](https://central.sonatype.com/namespace/org.opensearch.plugin) and install it using the standard [plugin installation]({{site.url}}{{site.baseurl}}/opensearch/install/plugins/) process. Performance Analyzer runs on each node in a cluster.
To start the Performance Analyzer root cause analysis (RCA) agent on a tarball installation, run the following command:
diff --git a/_observing-your-data/ad/dashboards-anomaly-detection.md b/_observing-your-data/ad/dashboards-anomaly-detection.md
index 679237094a..ad6fa5950b 100644
--- a/_observing-your-data/ad/dashboards-anomaly-detection.md
+++ b/_observing-your-data/ad/dashboards-anomaly-detection.md
@@ -18,12 +18,12 @@ You can connect data visualizations to OpenSearch datasets and then create, run,
Before getting started, you must have:
- Installed OpenSearch and OpenSearch Dashboards version 2.9 or later. See [Installing OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/index/).
-- Installed the Anomaly Detection plugin version 2.9 or later. See [Installing OpenSearch plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins).
+- Installed the Anomaly Detection plugin version 2.9 or later. See [Installing OpenSearch plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/).
- Installed the Anomaly Detection Dashboards plugin version 2.9 or later. See [Managing OpenSearch Dashboards plugins]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/plugins/) to get started.
## General requirements for anomaly detection visualizations
-Anomaly detection visualizations are displayed as time-series charts that give you a snapshot of when anomalies have occurred from different anomaly detectors you have configured for the visualization. You can display up to 10 metrics on your chart, and each series can be shown as a line on the chart. Note that only real-time anomalies will be visible on the chart. For more information on real-time and historical anomaly detection, see [Anomaly detection, Step 3: Set up detector jobs]({{site.url}}{{site.baseurl}}/observing-your-data/ad/index/#step-3-set-up-detector-jobs).
+Anomaly detection visualizations are displayed as time-series charts that give you a snapshot of when anomalies have occurred from different anomaly detectors you have configured for the visualization. You can display up to 10 metrics on your chart, and each series can be shown as a line on the chart. Note that only real-time anomalies will be visible on the chart. For more information about real-time and historical anomaly detection, see [Anomaly detection, Step 3: Set up detector jobs]({{site.url}}{{site.baseurl}}/observing-your-data/ad/index/#step-3-setting-up-detector-jobs).
Keep in mind the following requirements when setting up or creating anomaly detection visualizations. The visualization:
diff --git a/_observing-your-data/ad/index.md b/_observing-your-data/ad/index.md
index 5dfa1b8f1a..657c3c90cb 100644
--- a/_observing-your-data/ad/index.md
+++ b/_observing-your-data/ad/index.md
@@ -10,30 +10,42 @@ redirect_from:
# Anomaly detection
-An anomaly in OpenSearch is any unusual behavior change in your time-series data. Anomalies can provide valuable insights into your data. For example, for IT infrastructure data, an anomaly in the memory usage metric might help you uncover early signs of a system failure.
+An _anomaly_ in OpenSearch is any unusual behavior change in your time-series data. Anomalies can provide valuable insights into your data. For example, for IT infrastructure data, an anomaly in the memory usage metric can help identify early signs of a system failure.
-It can be challenging to discover anomalies using conventional methods such as creating visualizations and dashboards. You could configure an alert based on a static threshold, but this requires prior domain knowledge and isn't adaptive to data that exhibits organic growth or seasonal behavior.
+Conventional techniques like visualizations and dashboards can make it difficult to uncover anomalies. Configuring alerts based on static thresholds is possible, but this approach requires prior domain knowledge and may not adapt to data with organic growth or seasonal trends.
-Anomaly detection automatically detects anomalies in your OpenSearch data in near real-time using the Random Cut Forest (RCF) algorithm. RCF is an unsupervised machine learning algorithm that models a sketch of your incoming data stream to compute an `anomaly grade` and `confidence score` value for each incoming data point. These values are used to differentiate an anomaly from normal variations. For more information about how RCF works, see [Random Cut Forests](https://www.semanticscholar.org/paper/Robust-Random-Cut-Forest-Based-Anomaly-Detection-on-Guha-Mishra/ecb365ef9b67cd5540cc4c53035a6a7bd88678f9).
+Anomaly detection automatically detects anomalies in your OpenSearch data in near real time using the Random Cut Forest (RCF) algorithm. RCF is an unsupervised machine learning algorithm that models a sketch of your incoming data stream to compute an _anomaly grade_ and _confidence score_ value for each incoming data point. These values are used to differentiate an anomaly from normal variations. For more information about how RCF works, see [Robust Random Cut Forest Based Anomaly Detection on Streams](https://www.semanticscholar.org/paper/Robust-Random-Cut-Forest-Based-Anomaly-Detection-on-Guha-Mishra/ecb365ef9b67cd5540cc4c53035a6a7bd88678f9).
You can pair the Anomaly Detection plugin with the [Alerting plugin]({{site.url}}{{site.baseurl}}/monitoring-plugins/alerting/) to notify you as soon as an anomaly is detected.
+{: .note}
-To get started, choose **Anomaly Detection** in OpenSearch Dashboards.
-To first test with sample streaming data, you can try out one of the preconfigured detectors with one of the sample datasets.
+## Getting started with anomaly detection in OpenSearch Dashboards
+
+To get started, go to **OpenSearch Dashboards** > **OpenSearch Plugins** > **Anomaly Detection**.
## Step 1: Define a detector
-A detector is an individual anomaly detection task. You can define multiple detectors, and all the detectors can run simultaneously, with each analyzing data from different sources.
+A _detector_ is an individual anomaly detection task. You can define multiple detectors, and all detectors can run simultaneously, with each analyzing data from different sources. You can define a detector by following these steps:
+
+1. On the **Anomaly detection** page, select the **Create detector** button.
+2. On the **Define detector** page, enter the required information in the **Detector details** pane.
+3. In the **Select data** pane, specify the data source by choosing a source from the **Index** dropdown menu. You can choose an index, index patterns, or an alias.
+4. (Optional) Filter the data source by selecting **Add data filter** and then entering the conditions for **Field**, **Operator**, and **Value**. Alternatively, you can choose **Use query DSL** and add your JSON filter query. Only [Boolean queries]({{site.url}}{{site.baseurl}}/query-dsl/compound/bool/) are supported for query domain-specific language (DSL).
+#### Example: Filtering data using query DSL
+
+The following example query retrieves documents in which the `urlPath.keyword` field matches any of the specified values:
+
1. Choose **Create detector**.
1. Add in the detector details.
- Enter a name and brief description. Make sure the name is unique and descriptive enough to help you to identify the purpose of the detector.
1. Specify the data source.
- - For **Data source**, choose the index you want to use as the data source. You can optionally use index patterns to choose multiple indexes.
+ - For **Data source**, choose one or more indexes to use as the data source. Alternatively, you can use an alias or index pattern to choose multiple indexes.
+ - Detectors can use remote indexes. You can access them using the `cluster-name:index-name` pattern. See [Cross-cluster search]({{site.url}}{{site.baseurl}}/search-plugins/cross-cluster-search/) for more information. Alternatively, you can select clusters and indexes in OpenSearch Dashboards 2.17 or later. To learn about configuring remote indexes with the Security plugin enabled, see [Selecting remote indexes with fine-grained access control]({{site.url}}{{site.baseurl}}/observing-your-data/ad/security/#selecting-remote-indexes-with-fine-grained-access-control) in the [Anomaly detection security]({{site.url}}{{site.baseurl}}/observing-your-data/ad/security/) documentation.
- (Optional) For **Data filter**, filter the index you chose as the data source. From the **Data filter** menu, choose **Add data filter**, and then design your filter query by selecting **Field**, **Operator**, and **Value**, or choose **Use query DSL** and add your own JSON filter query. Only [Boolean queries]({{site.url}}{{site.baseurl}}/query-dsl/compound/bool/) are supported for query domain-specific language (DSL).
-#### Example filter using query DSL
-The query is designed to retrieve documents in which the `urlPath.keyword` field matches one of the following specified values:
+To create a cross-cluster detector in OpenSearch Dashboards, the following [permissions]({{site.url}}{{site.baseurl}}/security/access-control/permissions/) are required: `indices:data/read/field_caps`, `indices:admin/resolve/index`, and `cluster:monitor/remote/info`.
+{: .note}
- /domain/{id}/short
- /sub_dir/{id}/short
@@ -62,40 +74,38 @@ The query is designed to retrieve documents in which the `urlPath.keyword` field
}
}
```
+ {% include copy-curl.html %}
-1. Specify a timestamp.
- - Select the **Timestamp field** in your index.
-1. Define operation settings.
- - For **Operation settings**, define the **Detector interval**, which is the time interval at which the detector collects data.
- - The detector aggregates the data in this interval, then feeds the aggregated result into the anomaly detection model.
- The shorter you set this interval, the fewer data points the detector aggregates.
- The anomaly detection model uses a shingling process, a technique that uses consecutive data points to create a sample for the model. This process needs a certain number of aggregated data points from contiguous intervals.
-
- - We recommend setting the detector interval based on your actual data. If it's too long it might delay the results, and if it's too short it might miss some data. It also won't have a sufficient number of consecutive data points for the shingle process.
+5. In the **Timestamp** pane, select a field from the **Timestamp field** dropdown menu.
- - (Optional) To add extra processing time for data collection, specify a **Window delay** value.
+6. In the **Operation settings** pane, define the **Detector interval**, which is the interval at which the detector collects data.
+ - The detector aggregates the data at this interval and then feeds the aggregated result into the anomaly detection model. The shorter the interval, the fewer data points the detector aggregates. The anomaly detection model uses a shingling process, a technique that uses consecutive data points to create a sample for the model. This process requires a certain number of aggregated data points from contiguous intervals.
+ - You should set the detector interval based on your actual data. If the detector interval is too long, then it might delay the results. If the detector interval is too short, then it might miss some data. The detector interval also will not have a sufficient number of consecutive data points for the shingle process.
+ - (Optional) To add extra processing time for data collection, specify a **Window delay** value.
- This value tells the detector that the data is not ingested into OpenSearch in real time but with a certain delay. Set the window delay to shift the detector interval to account for this delay.
- - For example, say the detector interval is 10 minutes and data is ingested into your cluster with a general delay of 1 minute. Assume the detector runs at 2:00. The detector attempts to get the last 10 minutes of data from 1:50 to 2:00, but because of the 1-minute delay, it only gets 9 minutes of data and misses the data from 1:59 to 2:00. Setting the window delay to 1 minute shifts the interval window to 1:49--1:59, so the detector accounts for all 10 minutes of the detector interval time.
-1. Specify custom results index.
- - The Anomaly Detection plugin allows you to store anomaly detection results in a custom index of your choice. To enable this, select **Enable custom results index** and provide a name for your index, for example, `abc`. The plugin then creates an alias prefixed with `opensearch-ad-plugin-result-` followed by your chosen name, for example, `opensearch-ad-plugin-result-abc`. This alias points to an actual index with a name containing the date and a sequence number, like `opensearch-ad-plugin-result-abc-history-2024.06.12-000002`, where your results are stored.
+ - For example, the detector interval is 10 minutes and data is ingested into your cluster with a general delay of 1 minute. Assume the detector runs at 2:00. The detector attempts to get the last 10 minutes of data from 1:50 to 2:00, but because of the 1-minute delay, it only gets 9 minutes of data and misses the data from 1:59 to 2:00. Setting the window delay to 1 minute shifts the interval window to 1:49--1:59, so the detector accounts for all 10 minutes of the detector interval time.
+ - To avoid missing any data, set the **Window delay** to the upper limit of the expected ingestion delay. This ensures that the detector captures all data during its interval, reducing the risk of missing relevant information. While a longer window delay helps capture all data, a window delay that is too long can hinder real-time anomaly detection because the detector looks further back in time. Find a balance to maintain both data accuracy and timely detection.
- You can use the dash “-” sign to separate the namespace to manage custom results index permissions. For example, if you use `opensearch-ad-plugin-result-financial-us-group1` as the results index, you can create a permission role based on the pattern `opensearch-ad-plugin-result-financial-us-*` to represent the "financial" department at a granular level for the "us" area.
+7. Specify a custom results index.
+ - The Anomaly Detection plugin allows you to store anomaly detection results in a custom index of your choice. Select **Enable custom results index** and provide a name for your index, for example, `abc`. The plugin then creates an alias prefixed with `opensearch-ad-plugin-result-` followed by your chosen name, for example, `opensearch-ad-plugin-result-abc`. This alias points to an actual index with a name containing the date and a sequence number, such as `opensearch-ad-plugin-result-abc-history-2024.06.12-000002`, where your results are stored.
+
+ You can use `-` to separate the namespace to manage custom results index permissions. For example, if you use `opensearch-ad-plugin-result-financial-us-group1` as the results index, you can create a permission role based on the pattern `opensearch-ad-plugin-result-financial-us-*` to represent the `financial` department at a granular level for the `us` group.
{: .note }
- When the Security plugin (fine-grained access control) is enabled, the default results index becomes a system index and is no longer accessible through the standard Index or Search APIs. To access its content, you must use the Anomaly Detection RESTful API or the dashboard. As a result, you cannot build customized dashboards using the default results index if the Security plugin is enabled. However, you can create a custom results index in order to build customized dashboards.
- If the custom index you specify does not exist, the Anomaly Detection plugin will create it when you create the detector and start your real-time or historical analysis.
- If the custom index already exists, the plugin will verify that the index mapping matches the required structure for anomaly results. In this case, ensure that the custom index has a valid mapping as defined in the [`anomaly-results.json`](https://github.com/opensearch-project/anomaly-detection/blob/main/src/main/resources/mappings/anomaly-results.json) file.
- - To use the custom results index option, you need the following permissions:
- - `indices:admin/create` - The Anomaly Detection plugin requires the ability to create and roll over the custom index.
- - `indices:admin/aliases` - The Anomaly Detection plugin requires access to create and manage an alias for the custom index.
- - `indices:data/write/index` - You need the `write` permission for the Anomaly Detection plugin to write results into the custom index for a single-entity detector.
- - `indices:data/read/search` - You need the `search` permission because the Anomaly Detection plugin needs to search custom results indexes to show results on the Anomaly Detection UI.
- - `indices:data/write/delete` - Because the detector might generate a large number of anomaly results, you need the `delete` permission to delete old data and save disk space.
- - `indices:data/write/bulk*` - You need the `bulk*` permission because the Anomaly Detection plugin uses the bulk API to write results into the custom index.
- - Managing the custom results index:
- - The anomaly detection dashboard queries all detectors’ results from all custom results indexes. Having too many custom results indexes might impact the performance of the Anomaly Detection plugin.
- - You can use [Index State Management]({{site.url}}{{site.baseurl}}/im-plugin/ism/index/) to rollover old results indexes. You can also manually delete or archive any old results indexes. We recommend reusing a custom results index for multiple detectors.
- - The Anomaly Detection plugin also provides lifecycle management for custom indexes. It rolls an alias over to a new index when the custom results index meets any of the conditions in the following table.
+ - To use the custom results index option, you must have the following permissions:
+ - `indices:admin/create` -- The `create` permission is required in order to create and roll over the custom index.
+ - `indices:admin/aliases` -- The `aliases` permission is required in order to create and manage an alias for the custom index.
+ - `indices:data/write/index` -- The `write` permission is required in order to write results into the custom index for a single-entity detector.
+ - `indices:data/read/search` -- The `search` permission is required in order to search custom results indexes to show results on the Anomaly Detection interface.
+ - `indices:data/write/delete` -- The detector may generate many anomaly results. The `delete` permission is required in order to delete old data and save disk space.
+ - `indices:data/write/bulk*` -- The `bulk*` permission is required because the plugin uses the Bulk API to write results into the custom index.
+ - When managing the custom results index, consider the following:
+ - The anomaly detection dashboard queries all detector results from all custom results indexes. Having too many custom results indexes can impact the plugin's performance.
+ - You can use [Index State Management]({{site.url}}{{site.baseurl}}/im-plugin/ism/index/) to roll over old results indexes. You can also manually delete or archive any old results indexes. Reusing a custom results index for multiple detectors is recommended.
+ - The plugin provides lifecycle management for custom indexes. It rolls over an alias to a new index when the custom results index meets any of the conditions in the following table.
Parameter | Description | Type | Unit | Example | Required
:--- | :--- |:--- |:--- |:--- |:---
@@ -103,43 +113,52 @@ The query is designed to retrieve documents in which the `urlPath.keyword` field
`result_index_min_age` | The minimum index age required for rollover, calculated from its creation time to the current time. | `integer` |`day` | `7` | No
`result_index_ttl` | The minimum age required to permanently delete rolled-over indexes. | `integer` | `day` | `60` | No
-1. Choose **Next**.
+8. Choose **Next**.
After you define the detector, the next step is to configure the model.
## Step 2: Configure the model
-#### Add features to your detector
+1. Add features to your detector.
-A feature is the field in your index that you want to check for anomalies. A detector can discover anomalies across one or more features. You must choose an aggregation method for each feature: `average()`, `count()`, `sum()`, `min()`, or `max()`. The aggregation method determines what constitutes an anomaly.
+A _feature_ is any field in your index that you want to analyze for anomalies. A detector can discover anomalies across one or more features. You must choose an aggregation method for each feature: `average()`, `count()`, `sum()`, `min()`, or `max()`. The aggregation method determines what constitutes an anomaly.
For example, if you choose `min()`, the detector focuses on finding anomalies based on the minimum values of your feature. If you choose `average()`, the detector finds anomalies based on the average values of your feature.
-A multi-feature model correlates anomalies across all its features. The [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality) makes it less likely for multi-feature models to identify smaller anomalies as compared to a single-feature model. Adding more features might negatively impact the [precision and recall](https://en.wikipedia.org/wiki/Precision_and_recall) of a model. A higher proportion of noise in your data might further amplify this negative impact. Selecting the optimal feature set is usually an iterative process. By default, the maximum number of features for a detector is 5. You can adjust this limit with the `plugins.anomaly_detection.max_anomaly_features` setting.
-{: .note }
+A multi-feature model correlates anomalies across all its features. The [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality) makes it less likely that multi-feature models will identify smaller anomalies as compared to a single-feature model. Adding more features can negatively impact the [precision and recall](https://en.wikipedia.org/wiki/Precision_and_recall) of a model. A higher proportion of noise in your data can further amplify this negative impact. Selecting the optimal feature set is usually an iterative process. By default, the maximum number of features for a detector is `5`. You can adjust this limit using the `plugins.anomaly_detection.max_anomaly_features` setting.
+{: .note}
+
+### Configuring a model based on an aggregation method
To configure an anomaly detection model based on an aggregation method, follow these steps:
-1. On the **Configure Model** page, enter the **Feature name** and check **Enable feature**.
-1. For **Find anomalies based on**, select **Field Value**.
-1. For **aggregation method**, select either **average()**, **count()**, **sum()**, **min()**, or **max()**.
-1. For **Field**, select from the available options.
+1. On the **Detectors** page, select the desired detector from the list.
+2. On the detector's details page, select the **Actions** button to activate the dropdown menu and then select **Edit model configuration**.
+3. On the **Edit model configuration** page, select the **Add another feature** button.
+4. Enter a name in the **Feature name** field and select the **Enable feature** checkbox.
+5. Select **Field value** from the dropdown menu under **Find anomalies based on**.
+6. Select the desired aggregation from the dropdown menu under **Aggregation method**.
+7. Select the desired field from the options listed in the dropdown menu under **Field**.
+8. Select the **Save changes** button.
+
+### Configuring a model based on a JSON aggregation query
To configure an anomaly detection model based on a JSON aggregation query, follow these steps:
-1. On the **Configure Model** page, enter the **Feature name** and check **Enable feature**.
-1. For **Find anomalies based on**, select **Custom expression**. You will see the JSON editor window open up.
-1. Enter your JSON aggregation query in the editor.
-For acceptable JSON query syntax, see [OpenSearch Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index/)
-{: .note }
+1. On the **Edit model configuration** page, select the **Add another feature** button.
+2. Enter a name in the **Feature name** field and select the **Enable feature** checkbox.
+3. Select **Custom expression** from the dropdown menu under **Find anomalies based on**. The JSON editor window will open.
+4. Enter your JSON aggregation query in the editor.
+5. Select the **Save changes** button.
-#### (Optional) Set category fields for high cardinality
+For acceptable JSON query syntax, see [OpenSearch Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index/).
+{: .note}
-You can categorize anomalies based on a keyword or IP field type.
+### Setting categorical fields for high cardinality
-The category field categorizes or slices the source time series with a dimension like IP addresses, product IDs, country codes, and so on. This helps to see a granular view of anomalies within each entity of the category field to isolate and debug issues.
+You can categorize anomalies based on a keyword or IP field type. You can enable the **Categorical fields** option to categorize, or "slice," the source time series using a dimension, such as an IP address, a product ID, or a country code. This gives you a granular view of anomalies within each entity of the category field to help isolate and debug issues.
-To set a category field, choose **Enable a category field** and select a field. You can’t change the category fields after you create the detector.
+To set a category field, choose **Enable categorical fields** and select a field. You cannot change the category fields after you create the detector.
Only a certain number of unique entities are supported in the category field. Use the following equation to calculate the recommended total number of entities supported in a cluster:
@@ -147,7 +166,7 @@ Only a certain number of unique entities are supported in the category field. Us
(data nodes * heap size * anomaly detection maximum memory percentage) / (entity model size of a detector)
```
-To get the entity model size of a detector, use the [profile detector API]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/api/#profile-detector). You can adjust the maximum memory percentage with the `plugins.anomaly_detection.model_max_size_percent` setting.
+To get the detector's entity model size, use the [Profile Detector API]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/api/#profile-detector). You can adjust the maximum memory percentage using the `plugins.anomaly_detection.model_max_size_percent` setting.
Consider a cluster with 3 data nodes, each with 8 GB of JVM heap size and the default 10% memory allocation. With an entity model size of 1 MB, the following formula calculates the estimated number of unique entities:
@@ -155,81 +174,109 @@ Consider a cluster with 3 data nodes, each with 8 GB of JVM heap size and the de
(8096 MB * 0.1 / 1 MB ) * 3 = 2429
```
-If the actual total number of unique entities is higher than the number that you calculate (in this case, 2,429), the anomaly detector will attempt to model the extra entities. The detector prioritizes entities that occur more often and are more recent.
+If the actual total number of unique entities is higher than the number that you calculate (in this case, 2,429), then the anomaly detector attempts to model the extra entities. The detector prioritizes both entities that occur more often and are more recent.
-This formula serves as a starting point. Make sure to test it with a representative workload. You can find more information in the [Improving Anomaly Detection: One million entities in one minute](https://opensearch.org/blog/one-million-enitities-in-one-minute/) blog post.
+This formula serves as a starting point. Make sure to test it with a representative workload. See the OpenSearch blog post [Improving Anomaly Detection: One million entities in one minute](https://opensearch.org/blog/one-million-enitities-in-one-minute/) for more information.
{: .note }
-#### (Advanced settings) Set a shingle size
+### Setting a shingle size
-Set the number of aggregation intervals from your data stream to consider in a detection window. It’s best to choose this value based on your actual data to see which one leads to the best results for your use case.
+In the **Advanced settings** pane, you can set the number of data stream aggregation intervals to include in the detection window. Choose this value based on your actual data to find the optimal setting for your use case. To set the shingle size, select **Show** in the **Advanced settings** pane. Enter the desired size in the **intervals** field.
-The anomaly detector expects the shingle size to be in the range of 1 and 60. The default shingle size is 8. We recommend that you don't choose 1 unless you have two or more features. Smaller values might increase [recall](https://en.wikipedia.org/wiki/Precision_and_recall) but also false positives. Larger values might be useful for ignoring noise in a signal.
+The anomaly detector requires the shingle size to be between 1 and 128. The default is `8`. Use `1` only if you have at least two features. Values of less than `8` may increase [recall](https://en.wikipedia.org/wiki/Precision_and_recall) but also may increase false positives. Values greater than `8` may be useful for ignoring noise in a signal.
-#### Preview sample anomalies
+### Setting an imputation option
-Preview sample anomalies and adjust the feature settings if needed.
-For sample previews, the Anomaly Detection plugin selects a small number of data samples---for example, one data point every 30 minutes---and uses interpolation to estimate the remaining data points to approximate the actual feature data. It loads this sample dataset into the detector. The detector uses this sample dataset to generate a sample preview of anomaly results.
+In the **Advanced settings** pane, you can set the imputation option. This allows you to manage missing data in your streams. The options include the following:
-Examine the sample preview and use it to fine-tune your feature configurations (for example, enable or disable features) to get more accurate results.
+- **Ignore Missing Data (Default):** The system continues without considering missing data points, keeping the existing data flow.
+- **Fill with Custom Values:** Specify a custom value for each feature to replace missing data points, allowing for targeted imputation tailored to your data.
+- **Fill with Zeros:** Replace missing values with zeros. This is ideal when the absence of data indicates a significant event, such as a drop to zero in event counts.
+- **Use Previous Values:** Fill gaps with the last observed value to maintain continuity in your time-series data. This method treats missing data as non-anomalous, carrying forward the previous trend.
-1. Choose **Preview sample anomalies**.
- - If you don't see any sample anomaly result, check the detector interval and make sure you have more than 400 data points for some entities during the preview date range.
-1. Choose **Next**.
+Using these options can improve recall in anomaly detection. For instance, if you are monitoring for drops in event counts, including both partial and complete drops, then filling missing values with zeros helps detect significant data absences, improving detection recall.
+
+Be cautious when imputing extensively missing data, as excessive gaps can compromise model accuracy. Quality input is critical---poor data quality leads to poor model performance. The confidence score also decreases when imputations occur. You can check whether a feature value has been imputed using the `feature_imputed` field in the anomaly results index. See [Anomaly result mapping]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/result-mapping/) for more information.
+{: .note}
+
+### Suppressing anomalies with threshold-based rules
+
+In the **Advanced settings** pane, you can suppress anomalies by setting rules that define acceptable differences between the expected and actual values, either as an absolute value or a relative percentage. This helps reduce false anomalies caused by minor fluctuations, allowing you to focus on significant deviations.
+
+Suppose you want to detect substantial changes in log volume while ignoring small variations that are not meaningful. Without customized settings, the system might generate false alerts for minor changes, making it difficult to identify true anomalies. By setting suppression rules, you can ignore minor deviations and focus on real anomalous patterns.
+
+To suppress anomalies for deviations of less than 30% from the expected value, you can set the following rules:
-## Step 3: Set up detector jobs
+```
+Ignore anomalies for feature logVolume when the actual value is no more than 30% above the expected value.
+Ignore anomalies for feature logVolume when the actual value is no more than 30% below the expected value.
+```
+
+Ensure that a feature, for example, `logVolume`, is properly defined in your model. Suppression rules are tied to specific features.
+{: .note}
+
+If you expect that the log volume should differ by at least 10,000 from the expected value before being considered an anomaly, you can set absolute thresholds:
-To start a real-time detector to find anomalies in your data in near real-time, check **Start real-time detector automatically (recommended)**.
+```
+Ignore anomalies for feature logVolume when the actual value is no more than 10000 above the expected value.
+Ignore anomalies for feature logVolume when the actual value is no more than 10000 below the expected value.
+```
-Alternatively, if you want to perform historical analysis and find patterns in long historical data windows (weeks or months), check **Run historical analysis detection** and select a date range (at least 128 detection intervals).
+If no custom suppression rules are set, then the system defaults to a filter that ignores anomalies with deviations of less than 20% from the expected value for each enabled feature.
-Analyzing historical data helps you get familiar with the Anomaly Detection plugin. You can also evaluate the performance of a detector with historical data to further fine-tune it.
+### Previewing sample anomalies
-We recommend experimenting with historical analysis with different feature sets and checking the precision before moving on to real-time detectors.
+You can preview anomalies based on sample feature input and adjust the feature settings as needed. The Anomaly Detection plugin selects a small number of data samples---for example, 1 data point every 30 minutes---and uses interpolation to estimate the remaining data points to approximate the actual feature data. The sample dataset is loaded into the detector, which then uses the sample dataset to generate a preview of the anomalies.
+
+1. Choose **Preview sample anomalies**.
+ - If sample anomaly results are not displayed, check the detector interval to verify that there are 400 or more data points for the entities during the preview date range.
+2. Select the **Next** button.
-## Step 4: Review and create
+## Step 3: Setting up detector jobs
-Review your detector settings and model configurations to make sure that they're valid and then select **Create detector**.
+To start a detector to find anomalies in your data in near real time, select **Start real-time detector automatically (recommended)**.
-![Anomaly detection results]({{site.url}}{{site.baseurl}}/images/review_ad.png)
+Alternatively, if you want to perform historical analysis and find patterns in longer historical data windows (weeks or months), select the **Run historical analysis detection** box and select a date range of at least 128 detection intervals.
-If you see any validation errors, edit the settings to fix the errors and then return back to this page.
+Analyzing historical data can help to familiarize you with the Anomaly Detection plugin. For example, you can evaluate the performance of a detector against historical data in order to fine-tune it.
+
+You can experiment with historical analysis by using different feature sets and checking the precision before using real-time detectors.
+
+## Step 4: Reviewing detector settings
+
+Review your detector settings and model configurations to confirm that they are valid and then select **Create detector**.
+
+If a validation error occurs, edit the settings to correct the error and return to the detector page.
{: .note }
-## Step 5: Observe the results
+## Step 5: Observing the results
-Choose the **Real-time results** or **Historical analysis** tab. For real-time results, you need to wait for some time to see the anomaly results. If the detector interval is 10 minutes, the detector might take more than an hour to start, because its waiting for sufficient data to generate anomalies.
+Choose either the **Real-time results** or **Historical analysis** tab. For real-time results, it will take some time to display the anomaly results. For example, if the detector interval is 10 minutes, then the detector may take more than an hour to initialize because it is waiting for sufficient data to be able to generate anomalies.
-A shorter interval means the model passes the shingle process more quickly and starts to generate the anomaly results sooner.
-Use the [profile detector]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/api#profile-detector) operation to make sure you have sufficient data points.
+A shorter interval results in the model passing the shingle process more quickly and generating anomaly results sooner. You can use the [profile detector]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/api#profile-detector) operation to ensure that you have enough data points.
-If you see the detector pending in "initialization" for longer than a day, aggregate your existing data using the detector interval to check for any missing data points. If you find a lot of missing data points from the aggregated data, consider increasing the detector interval.
+If the detector is pending in "initialization" for longer than 1 day, aggregate your existing data using the detector interval to check for any missing data points. If you find many missing data points, consider increasing the detector interval.
-Choose and drag over the anomaly line chart to zoom in and see a more detailed view of an anomaly.
+Click and drag over the anomaly line chart to zoom in and see a detailed view of an anomaly.
{: .note }
-Analyze anomalies with the following visualizations:
+You can analyze anomalies using the following visualizations:
-- **Live anomalies** (for real-time results) displays live anomaly results for the last 60 intervals. For example, if the interval is 10, it shows results for the last 600 minutes. The chart refreshes every 30 seconds.
-- **Anomaly overview** (for real-time results) / **Anomaly history** (for historical analysis in the **Historical analysis** tab) plots the anomaly grade with the corresponding measure of confidence. This pane includes:
+- **Live anomalies** (for real-time results) displays live anomaly results for the last 60 intervals. For example, if the interval is `10`, it shows results for the last 600 minutes. The chart refreshes every 30 seconds.
+- **Anomaly overview** (for real-time results) or **Anomaly history** (for historical analysis on the **Historical analysis** tab) plots the anomaly grade with the corresponding measure of confidence. The pane includes:
- The number of anomaly occurrences based on the given data-time range.
- - The **Average anomaly grade**, a number between 0 and 1 that indicates how anomalous a data point is. An anomaly grade of 0 represents “not an anomaly,” and a non-zero value represents the relative severity of the anomaly.
+ - The **Average anomaly grade**, a number between 0 and 1 that indicates how anomalous a data point is. An anomaly grade of `0` represents "not an anomaly," and a non-zero value represents the relative severity of the anomaly.
- **Confidence** estimate of the probability that the reported anomaly grade matches the expected anomaly grade. Confidence increases as the model observes more data and learns the data behavior and trends. Note that confidence is distinct from model accuracy.
- **Last anomaly occurrence** is the time at which the last anomaly occurred.
-Underneath **Anomaly overview**/**Anomaly history** are:
+Underneath **Anomaly overview** or **Anomaly history** are:
- **Feature breakdown** plots the features based on the aggregation method. You can vary the date-time range of the detector. Selecting a point on the feature line chart shows the **Feature output**, the number of times a field appears in your index, and the **Expected value**, a predicted value for the feature output. Where there is no anomaly, the output and expected values are equal.
- ![Anomaly detection results]({{site.url}}{{site.baseurl}}/images/feature-contribution-ad.png)
-
- **Anomaly occurrences** shows the `Start time`, `End time`, `Data confidence`, and `Anomaly grade` for each detected anomaly.
Selecting a point on the anomaly line chart shows **Feature Contribution**, the percentage of a feature that contributes to the anomaly
-![Anomaly detection results]({{site.url}}{{site.baseurl}}/images/feature-contribution-ad.png)
-
-
If you set the category field, you see an additional **Heat map** chart. The heat map correlates results for anomalous entities. This chart is empty until you select an anomalous entity. You also see the anomaly and feature line chart for the time period of the anomaly (`anomaly_grade` > 0).
@@ -249,7 +296,7 @@ To see all the configuration settings for a detector, choose the **Detector conf
1. To make any changes to the detector configuration, or fine tune the time interval to minimize any false positives, go to the **Detector configuration** section and choose **Edit**.
- You need to stop real-time and historical analysis to change its configuration. Confirm that you want to stop the detector and proceed.
-1. To enable or disable features, in the **Features** section, choose **Edit** and adjust the feature settings as needed. After you make your changes, choose **Save and start detector**.
+2. To enable or disable features, in the **Features** section, choose **Edit** and adjust the feature settings as needed. After you make your changes, choose **Save and start detector**.
## Step 8: Manage your detectors
diff --git a/_observing-your-data/ad/result-mapping.md b/_observing-your-data/ad/result-mapping.md
index 7e1482a013..967b185684 100644
--- a/_observing-your-data/ad/result-mapping.md
+++ b/_observing-your-data/ad/result-mapping.md
@@ -9,9 +9,7 @@ redirect_from:
# Anomaly result mapping
-If you enabled custom result index, the anomaly detection plugin stores the results in your own index.
-
-If the anomaly detector doesn’t detect an anomaly, the result has the following format:
+When you select the **Enable custom result index** box on the **Custom result index** pane, the Anomaly Detection plugin will save the results to an index of your choosing. When the anomaly detector does not detect an anomaly, the result format is as follows:
```json
{
@@ -61,6 +59,7 @@ If the anomaly detector doesn’t detect an anomaly, the result has the followin
"threshold": 1.2368549346675202
}
```
+{% include copy-curl.html %}
## Response body fields
@@ -80,7 +79,83 @@ Field | Description
`model_id` | A unique ID that identifies a model. If a detector is a single-stream detector (with no category field), it has only one model. If a detector is a high-cardinality detector (with one or more category fields), it might have multiple models, one for each entity.
`threshold` | One of the criteria for a detector to classify a data point as an anomaly is that its `anomaly_score` must surpass a dynamic threshold. This field records the current threshold.
-If an anomaly detector detects an anomaly, the result has the following format:
+When the imputation option is enabled, the anomaly results include a `feature_imputed` array showing which features were modified due to missing data. If no features were imputed, then this array is omitted from the results.
+
+In the following example anomaly result output, the `processing_bytes_max` feature was imputed, as shown by the `imputed: true` status:
+
+```json
+{
+ "detector_id": "kzcZ43wBgEQAbjDnhzGF",
+ "schema_version": 5,
+ "data_start_time": 1635898161367,
+ "data_end_time": 1635898221367,
+ "feature_data": [
+ {
+ "feature_id": "processing_bytes_max",
+ "feature_name": "processing bytes max",
+ "data": 2322
+ },
+ {
+ "feature_id": "processing_bytes_avg",
+ "feature_name": "processing bytes avg",
+ "data": 1718.6666666666667
+ },
+ {
+ "feature_id": "processing_bytes_min",
+ "feature_name": "processing bytes min",
+ "data": 1375
+ },
+ {
+ "feature_id": "processing_bytes_sum",
+ "feature_name": "processing bytes sum",
+ "data": 5156
+ },
+ {
+ "feature_id": "processing_time_max",
+ "feature_name": "processing time max",
+ "data": 31198
+ }
+ ],
+ "execution_start_time": 1635898231577,
+ "execution_end_time": 1635898231622,
+ "anomaly_score": 1.8124904404395776,
+ "anomaly_grade": 0,
+ "confidence": 0.9802940756605277,
+ "entity": [
+ {
+ "name": "process_name",
+ "value": "process_3"
+ }
+ ],
+ "model_id": "kzcZ43wBgEQAbjDnhzGF_entity_process_3",
+ "threshold": 1.2368549346675202,
+ "feature_imputed": [
+ {
+ "feature_id": "processing_bytes_max",
+ "imputed": true
+ },
+ {
+ "feature_id": "processing_bytes_avg",
+ "imputed": false
+ },
+ {
+ "feature_id": "processing_bytes_min",
+ "imputed": false
+ },
+ {
+ "feature_id": "processing_bytes_sum",
+ "imputed": false
+ },
+ {
+ "feature_id": "processing_time_max",
+ "imputed": false
+ }
+ ]
+}
+```
+{% include copy-curl.html %}
+
+When an anomaly is detected, the result is provided in the following format:
```json
{
@@ -179,24 +254,23 @@ If an anomaly detector detects an anomaly, the result has the following format:
"execution_start_time": 1635898427803
}
```
+{% include copy-curl.html %}
-You can see the following additional fields:
+Note that the result includes the following additional fields.
Field | Description
:--- | :---
`relevant_attribution` | Represents the contribution of each input variable. The sum of the attributions is normalized to 1.
`expected_values` | The expected value for each feature.
-At times, the detector might detect an anomaly late.
-Let's say the detector sees a random mix of the triples {1, 2, 3} and {2, 4, 5} that correspond to `slow weeks` and `busy weeks`, respectively. For example 1, 2, 3, 1, 2, 3, 2, 4, 5, 1, 2, 3, 2, 4, 5, ... and so on.
-If the detector comes across a pattern {2, 2, X} and it's yet to see X, the detector infers that the pattern is anomalous, but it can't determine at this point which of the 2's is the cause. If X = 3, then the detector knows it's the first 2 in that unfinished triple, and if X = 5, then it's the second 2. If it's the first 2, then the detector detects the anomaly late.
+The detector may be late in detecting an anomaly. For example: The detector observes a sequence of data that alternates between "slow weeks" (represented by the triples {1, 2, 3}) and "busy weeks" (represented by the triples {2, 4, 5}). If the detector comes across a pattern {2, 2, X}, where it has not yet seen the value that X will take, then the detector infers that the pattern is anomalous. However, it cannot determine which 2 is the cause. If X = 3, then the first 2 is the anomaly. If X = 5, then the second 2 is the anomaly. If it is the first 2, then the detector will be late in detecting the anomaly.
-If a detector detects an anomaly late, the result has the following additional fields:
+When a detector is late in detecting an anomaly, the result includes the following additional fields.
Field | Description
:--- | :---
-`past_values` | The actual input that triggered an anomaly. If `past_values` is null, the attributions or expected values are from the current input. If `past_values` is not null, the attributions or expected values are from a past input (for example, the previous two steps of the data [1,2,3]).
-`approx_anomaly_start_time` | The approximate time of the actual input that triggers an anomaly. This field helps you understand when a detector flags an anomaly. Both single-stream and high-cardinality detectors don't query previous anomaly results because these queries are expensive operations. The cost is especially high for high-cardinality detectors that might have a lot of entities. If the data is not continuous, the accuracy of this field is low and the actual time that the detector detects an anomaly can be earlier.
+`past_values` | The actual input that triggered an anomaly. If `past_values` is `null`, then the attributions or expected values are from the current input. If `past_values` is not `null`, then the attributions or expected values are from a past input (for example, the previous two steps of the data [1,2,3]).
+`approx_anomaly_start_time` | The approximate time of the actual input that triggered an anomaly. This field helps you understand the time at which a detector flags an anomaly. Both single-stream and high-cardinality detectors do not query previous anomaly results because these queries are costly operations. The cost is especially high for high-cardinality detectors that may have many entities. If the data is not continuous, then the accuracy of this field is low and the actual time at which the detector detects an anomaly can be earlier.
```json
{
@@ -319,3 +393,4 @@ Field | Description
"approx_anomaly_start_time": 1635883620000
}
```
+{% include copy-curl.html %}
diff --git a/_observing-your-data/ad/security.md b/_observing-your-data/ad/security.md
index 8eeaa3df41..e4816cec46 100644
--- a/_observing-your-data/ad/security.md
+++ b/_observing-your-data/ad/security.md
@@ -23,6 +23,11 @@ As an admin user, you can use the Security plugin to assign specific permissions
The Security plugin has two built-in roles that cover most anomaly detection use cases: `anomaly_full_access` and `anomaly_read_access`. For descriptions of each, see [Predefined roles]({{site.url}}{{site.baseurl}}/security/access-control/users-roles#predefined-roles).
+If you use OpenSearch Dashboards to create your anomaly detectors, you may experience access issues even with `anomaly_full_access`. This issue has been resolved in OpenSearch 2.17, but for earlier versions, the following additional permissions need to be added:
+
+- `indices:data/read/search` -- You need this permission because the Anomaly Detection plugin needs to search the data source in order to validate whether there is enough data to train the model.
+- `indices:admin/mappings/fields/get` and `indices:admin/mappings/fields/get*` -- You need these permissions to validate whether the given data source has a valid timestamp field and categorical field (in the case of creating a high-cardinality detector).
+
If these roles don't meet your needs, mix and match individual anomaly detection [permissions]({{site.url}}{{site.baseurl}}/security/access-control/permissions/) to suit your use case. Each action corresponds to an operation in the REST API. For example, the `cluster:admin/opensearch/ad/detector/delete` permission lets you delete detectors.
### A note on alerts and fine-grained access control
@@ -31,6 +36,42 @@ When a trigger generates an alert, the detector and monitor configurations, the
To reduce the chances of unintended users viewing metadata that could describe an index, we recommend that administrators enable role-based access control and keep these kinds of design elements in mind when assigning permissions to the intended group of users. See [Limit access by backend role](#advanced-limit-access-by-backend-role) for details.
+### Selecting remote indexes with fine-grained access control
+
+To use a remote index as a data source for a detector, see the setup steps in [Authentication flow]({{site.url}}{{site.baseurl}}/search-plugins/cross-cluster-search/#authentication-flow) in [Cross-cluster search]({{site.url}}{{site.baseurl}}/search-plugins/cross-cluster-search/). You must use a role that exists in both the remote and local clusters. The remote cluster must map the chosen role to the same username as in the local cluster.
+
+---
+
+#### Example: Create a new user on the local cluster
+
+1. Create a new user on the local cluster to use for detector creation:
+
+```
+curl -XPUT -k -u 'admin:' 'https://localhost:9200/_plugins/_security/api/internalusers/anomalyuser' -H 'Content-Type: application/json' -d '{"password":"password"}'
+```
+{% include copy-curl.html %}
+
+2. Map the new user to the `anomaly_full_access` role:
+
+```
+curl -XPUT -k -u 'admin:' -H 'Content-Type: application/json' 'https://localhost:9200/_plugins/_security/api/rolesmapping/anomaly_full_access' -d '{"users" : ["anomalyuser"]}'
+```
+{% include copy-curl.html %}
+
+3. On the remote cluster, create the same user and map `anomaly_full_access` to that role:
+
+```
+curl -XPUT -k -u 'admin:' 'https://localhost:9250/_plugins/_security/api/internalusers/anomalyuser' -H 'Content-Type: application/json' -d '{"password":"password"}'
+curl -XPUT -k -u 'admin:' -H 'Content-Type: application/json' 'https://localhost:9250/_plugins/_security/api/rolesmapping/anomaly_full_access' -d '{"users" : ["anomalyuser"]}'
+```
+{% include copy-curl.html %}
+
+---
+
+### Custom results index
+
+To use a custom results index, you need additional permissions not included in the default roles provided by the OpenSearch Security plugin. To add these permissions, see [Step 1: Define a detector]({{site.url}}{{site.baseurl}}/observing-your-data/ad/index/#step-1-define-a-detector) in the [Anomaly detection]({{site.url}}{{site.baseurl}}/observing-your-data/ad/index/) documentation.
+
## (Advanced) Limit access by backend role
Use backend roles to configure fine-grained access to individual detectors based on roles. For example, users of different departments in an organization can view detectors owned by their own department.
diff --git a/_observing-your-data/query-insights/grouping-top-n-queries.md b/_observing-your-data/query-insights/grouping-top-n-queries.md
new file mode 100644
index 0000000000..28cbcbb8e5
--- /dev/null
+++ b/_observing-your-data/query-insights/grouping-top-n-queries.md
@@ -0,0 +1,331 @@
+---
+layout: default
+title: Grouping top N queries
+parent: Query insights
+nav_order: 20
+---
+
+# Grouping top N queries
+**Introduced 2.17**
+{: .label .label-purple }
+
+Monitoring the [top N queries]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/top-n-queries/) can help you to identify the most resource-intensive queries based on latency, CPU, and memory usage in a specified time window. However, if a single computationally expensive query is executed multiple times, it can occupy all top N query slots, potentially preventing other expensive queries from appearing in the list. To address this issue, you can group similar queries, gaining insight into various high-impact query groups.
+
+Starting with OpenSearch version 2.17, the top N queries can be grouped by `similarity`, with additional grouping options planned for future version releases.
+
+## Grouping queries by similarity
+
+Grouping queries by `similarity` organizes them based on the query structure, removing everything except the core query operations.
+
+For example, the following query:
+
+```json
+{
+ "query": {
+ "bool": {
+ "must": [
+ { "exists": { "field": "field1" } }
+ ],
+ "query_string": {
+ "query": "search query"
+ }
+ }
+ }
+}
+```
+
+Has the following corresponding query structure:
+
+```c
+bool
+ must
+ exists
+ query_string
+```
+
+When queries share the same query structure, they are grouped together, ensuring that all similar queries belong to the same group.
+
+
+## Aggregate metrics per group
+
+In addition to retrieving latency, CPU, and memory metrics for individual top N queries, you can obtain aggregate statistics for the
+top N query groups. For each query group, the response includes the following statistics:
+- The total latency, CPU usage, or memory usage (depending on the configured metric type)
+- The total query count
+
+Using these statistics, you can calculate the average latency, CPU usage, or memory usage for each query group.
+The response also includes one example query from the query group.
+
+## Configuring query grouping
+
+Before you enable query grouping, you must enable top N query monitoring for a metric type of your choice. For more information, see [Configuring top N query monitoring]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/top-n-queries/#configuring-top-n-query-monitoring).
+
+To configure grouping for top N queries, use the following steps.
+
+### Step 1: Enable top N query monitoring
+
+Ensure that top N query monitoring is enabled for at least one of the metrics: latency, CPU, or memory. For more information, see [Configuring top N query monitoring]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/top-n-queries/#configuring-top-n-query-monitoring).
+
+For example, to enable top N query monitoring by latency with the default settings, send the following request:
+
+```json
+PUT _cluster/settings
+{
+ "persistent" : {
+ "search.insights.top_queries.latency.enabled" : true
+ }
+}
+```
+{% include copy-curl.html %}
+
+### Step 2: Configure query grouping
+
+Set the desired grouping method by updating the following cluster setting:
+
+```json
+PUT _cluster/settings
+{
+ "persistent" : {
+ "search.insights.top_queries.group_by" : "similarity"
+ }
+}
+```
+{% include copy-curl.html %}
+
+The default value for the `group_by` setting is `none`, which disables grouping. As of OpenSearch 2.17, the supported values for `group_by` are `similarity` and `none`.
+
+### Step 3 (Optional): Limit the number of monitored query groups
+
+Optionally, you can limit the number of monitored query groups. Queries already included in the top N query list (the most resource-intensive queries) will not be considered in determining the limit. Essentially, the maximum applies only to other query groups, and the top N queries are tracked separately. This helps manage the tracking of query groups based on workload and query window size.
+
+To limit tracking to 100 query groups, send the following request:
+
+```json
+PUT _cluster/settings
+{
+ "persistent" : {
+ "search.insights.top_queries.max_groups_excluding_topn" : 100
+ }
+}
+```
+{% include copy-curl.html %}
+
+The default value for `max_groups_excluding_topn` is `100`, and you can set it to any value between `0` and `10000`, inclusive.
+
+## Monitoring query groups
+
+To view the top N query groups, send the following request:
+
+```json
+GET /_insights/top_queries
+```
+{% include copy-curl.html %}
+
+The response contains the top N query groups:
+
+<details markdown="block">
+  <summary>
+    Response
+  </summary>
+  {: .text-delta}
+
+```json
+{
+ "top_queries": [
+ {
+ "timestamp": 1725495127359,
+ "source": {
+ "query": {
+ "match_all": {
+ "boost": 1.0
+ }
+ }
+ },
+ "phase_latency_map": {
+ "expand": 0,
+ "query": 55,
+ "fetch": 3
+ },
+ "total_shards": 1,
+ "node_id": "ZbINz1KFS1OPeFmN-n5rdg",
+ "query_hashcode": "b4c4f69290df756021ca6276be5cbb75",
+ "task_resource_usages": [
+ {
+ "action": "indices:data/read/search[phase/query]",
+ "taskId": 30,
+ "parentTaskId": 29,
+ "nodeId": "ZbINz1KFS1OPeFmN-n5rdg",
+ "taskResourceUsage": {
+ "cpu_time_in_nanos": 33249000,
+ "memory_in_bytes": 2896848
+ }
+ },
+ {
+ "action": "indices:data/read/search",
+ "taskId": 29,
+ "parentTaskId": -1,
+ "nodeId": "ZbINz1KFS1OPeFmN-n5rdg",
+ "taskResourceUsage": {
+ "cpu_time_in_nanos": 3151000,
+ "memory_in_bytes": 133936
+ }
+ }
+ ],
+ "indices": [
+ "my_index"
+ ],
+ "labels": {},
+ "search_type": "query_then_fetch",
+ "measurements": {
+ "latency": {
+ "number": 160,
+ "count": 10,
+ "aggregationType": "AVERAGE"
+ }
+ }
+ },
+ {
+ "timestamp": 1725495135160,
+ "source": {
+ "query": {
+ "term": {
+ "content": {
+ "value": "first",
+ "boost": 1.0
+ }
+ }
+ }
+ },
+ "phase_latency_map": {
+ "expand": 0,
+ "query": 18,
+ "fetch": 0
+ },
+ "total_shards": 1,
+ "node_id": "ZbINz1KFS1OPeFmN-n5rdg",
+ "query_hashcode": "c3620cc3d4df30fb3f95aeb2167289a4",
+ "task_resource_usages": [
+ {
+ "action": "indices:data/read/search[phase/query]",
+ "taskId": 50,
+ "parentTaskId": 49,
+ "nodeId": "ZbINz1KFS1OPeFmN-n5rdg",
+ "taskResourceUsage": {
+ "cpu_time_in_nanos": 10188000,
+ "memory_in_bytes": 288136
+ }
+ },
+ {
+ "action": "indices:data/read/search",
+ "taskId": 49,
+ "parentTaskId": -1,
+ "nodeId": "ZbINz1KFS1OPeFmN-n5rdg",
+ "taskResourceUsage": {
+ "cpu_time_in_nanos": 262000,
+ "memory_in_bytes": 3216
+ }
+ }
+ ],
+ "indices": [
+ "my_index"
+ ],
+ "labels": {},
+ "search_type": "query_then_fetch",
+ "measurements": {
+ "latency": {
+ "number": 109,
+ "count": 7,
+ "aggregationType": "AVERAGE"
+ }
+ }
+ },
+ {
+ "timestamp": 1725495139766,
+ "source": {
+ "query": {
+ "match": {
+ "content": {
+ "query": "first",
+ "operator": "OR",
+ "prefix_length": 0,
+ "max_expansions": 50,
+ "fuzzy_transpositions": true,
+ "lenient": false,
+ "zero_terms_query": "NONE",
+ "auto_generate_synonyms_phrase_query": true,
+ "boost": 1.0
+ }
+ }
+ }
+ },
+ "phase_latency_map": {
+ "expand": 0,
+ "query": 15,
+ "fetch": 0
+ },
+ "total_shards": 1,
+ "node_id": "ZbINz1KFS1OPeFmN-n5rdg",
+ "query_hashcode": "484eaabecd13db65216b9e2ff5eee999",
+ "task_resource_usages": [
+ {
+ "action": "indices:data/read/search[phase/query]",
+ "taskId": 64,
+ "parentTaskId": 63,
+ "nodeId": "ZbINz1KFS1OPeFmN-n5rdg",
+ "taskResourceUsage": {
+ "cpu_time_in_nanos": 12161000,
+ "memory_in_bytes": 473456
+ }
+ },
+ {
+ "action": "indices:data/read/search",
+ "taskId": 63,
+ "parentTaskId": -1,
+ "nodeId": "ZbINz1KFS1OPeFmN-n5rdg",
+ "taskResourceUsage": {
+ "cpu_time_in_nanos": 293000,
+ "memory_in_bytes": 3216
+ }
+ }
+ ],
+ "indices": [
+ "my_index"
+ ],
+ "labels": {},
+ "search_type": "query_then_fetch",
+ "measurements": {
+ "latency": {
+ "number": 43,
+ "count": 3,
+ "aggregationType": "AVERAGE"
+ }
+ }
+ }
+ ]
+}
+```
+
+</details>
+
+## Response fields
+
+The response includes the following fields.
+
+Field | Data type | Description
+:--- |:---| :---
+`top_queries` | Array | The list of top query groups.
+`top_queries.timestamp` | Integer | The execution timestamp for the first query in the query group.
+`top_queries.source` | Object | The first query in the query group.
+`top_queries.phase_latency_map` | Object | The phase latency map for the first query in the query group. The map includes the amount of time, in milliseconds, that the query spent in the `expand`, `query`, and `fetch` phases.
+`top_queries.total_shards` | Integer | The number of shards on which the first query was executed.
+`top_queries.node_id` | String | The node ID of the node that coordinated the execution of the first query in the query group.
+`top_queries.query_hashcode` | String | The hash code that uniquely identifies the query group. This is essentially the hash of the [query structure](#grouping-queries-by-similarity).
+`top_queries.task_resource_usages` | Array of objects | The resource usage breakdown for the various tasks belonging to the first query in the query group.
+`top_queries.indices` | Array | The indexes searched by the first query in the query group.
+`top_queries.labels` | Object | Used to label the top query.
+`top_queries.search_type` | String | The search request execution type (`query_then_fetch` or `dfs_query_then_fetch`). For more information, see the `search_type` parameter in the [Search API documentation]({{site.url}}{{site.baseurl}}/api-reference/search/#url-parameters).
+`top_queries.measurements` | Object | The aggregate measurements for the query group.
+`top_queries.measurements.latency` | Object | The aggregate latency measurements for the query group.
+`top_queries.measurements.latency.number` | Integer | The total latency for the query group.
+`top_queries.measurements.latency.count` | Integer | The number of queries in the query group.
+`top_queries.measurements.latency.aggregationType` | String | The aggregation type for the current entry. If grouping by similarity is enabled, then `aggregationType` is `AVERAGE`. If it is not enabled, then `aggregationType` is `NONE`.
\ No newline at end of file
diff --git a/_observing-your-data/query-insights/index.md b/_observing-your-data/query-insights/index.md
index 549371240f..ef3a65bfcd 100644
--- a/_observing-your-data/query-insights/index.md
+++ b/_observing-your-data/query-insights/index.md
@@ -7,8 +7,10 @@ has_toc: false
---
# Query insights
+**Introduced 2.12**
+{: .label .label-purple }
-To monitor and analyze the search queries within your OpenSearch clusterQuery information, you can obtain query insights. With minimal performance impact, query insights features aim to provide comprehensive insights into search query execution, enabling you to better understand search query characteristics, patterns, and system behavior during query execution stages. Query insights facilitate enhanced detection, diagnosis, and prevention of query performance issues, ultimately improving query processing performance, user experience, and overall system resilience.
+To monitor and analyze the search queries within your OpenSearch cluster, you can obtain query insights. With minimal performance impact, query insights features aim to provide comprehensive insights into search query execution, enabling you to better understand search query characteristics, patterns, and system behavior during query execution stages. Query insights facilitate enhanced detection, diagnosis, and prevention of query performance issues, ultimately improving query processing performance, user experience, and overall system resilience.
Typical use cases for query insights features include the following:
@@ -36,4 +38,5 @@ For information about installing plugins, see [Installing plugins]({{site.url}}{
You can obtain the following information using Query Insights:
- [Top n queries]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/top-n-queries/)
+- [Grouping top N queries]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/grouping-top-n-queries/)
- [Query metrics]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/query-metrics/)
diff --git a/_observing-your-data/query-insights/query-metrics.md b/_observing-your-data/query-insights/query-metrics.md
index c8caf21d65..beac8d4e18 100644
--- a/_observing-your-data/query-insights/query-metrics.md
+++ b/_observing-your-data/query-insights/query-metrics.md
@@ -2,10 +2,12 @@
layout: default
title: Query metrics
parent: Query insights
-nav_order: 20
+nav_order: 30
---
# Query metrics
+**Introduced 2.16**
+{: .label .label-purple }
Key query [metrics](#metrics), such as aggregation types, query types, latency, and resource usage per query type, are captured along the search path by using the OpenTelemetry (OTel) instrumentation framework. The telemetry data can be consumed using OTel metrics [exporters]({{site.url}}{{site.baseurl}}/observing-your-data/trace/distributed-tracing/#exporters).
diff --git a/_observing-your-data/query-insights/top-n-queries.md b/_observing-your-data/query-insights/top-n-queries.md
index f07fd2dfef..b63d670926 100644
--- a/_observing-your-data/query-insights/top-n-queries.md
+++ b/_observing-your-data/query-insights/top-n-queries.md
@@ -7,7 +7,7 @@ nav_order: 10
# Top N queries
-Monitoring the top N queries in query insights features can help you gain real-time insights into the top queries with high latency within a certain time frame (for example, the last hour).
+Monitoring the top N queries using query insights allows you to gain real-time visibility into the queries with the highest latency or resource consumption in a specified time period (for example, the last hour).
## Configuring top N query monitoring
@@ -72,14 +72,14 @@ PUT _cluster/settings
## Monitoring the top N queries
-You can use the Insights API endpoint to obtain the top N queries for all metric types:
+You can use the Insights API endpoint to retrieve the top N queries. This API returns top N `latency` results by default.
```json
GET /_insights/top_queries
```
{% include copy-curl.html %}
-Specify a metric type to filter the response:
+Specify the `type` parameter to retrieve the top N results for other metric types. The results will be sorted in descending order based on the specified metric type.
```json
GET /_insights/top_queries?type=latency
@@ -96,6 +96,9 @@ GET /_insights/top_queries?type=memory
```
{% include copy-curl.html %}
+If your query returns no results, ensure that top N query monitoring is enabled for the target metric type and that search requests were made within the current [time window](#configuring-the-window-size).
+{: .important}
+
## Exporting top N query data
You can configure your desired exporter to export top N query data to different sinks, allowing for better monitoring and analysis of your OpenSearch queries. Currently, the following exporters are supported:
diff --git a/_query-dsl/geo-and-xy/geo-bounding-box.md b/_query-dsl/geo-and-xy/geo-bounding-box.md
index 1112a4278e..66fcc224d6 100644
--- a/_query-dsl/geo-and-xy/geo-bounding-box.md
+++ b/_query-dsl/geo-and-xy/geo-bounding-box.md
@@ -173,11 +173,11 @@ GET testindex1/_search
```
{% include copy-curl.html %}
-## Request fields
+## Parameters
-Geo-bounding box queries accept the following fields.
+Geo-bounding box queries accept the following parameters.
-Field | Data type | Description
+Parameter | Data type | Description
:--- | :--- | :---
`_name` | String | The name of the filter. Optional.
`validation_method` | String | The validation method. Valid values are `IGNORE_MALFORMED` (accept geopoints with invalid coordinates), `COERCE` (try to coerce coordinates to valid values), and `STRICT` (return an error when coordinates are invalid). Default is `STRICT`.
diff --git a/_query-dsl/geo-and-xy/geodistance.md b/_query-dsl/geo-and-xy/geodistance.md
index b272cad81e..3eef58bc69 100644
--- a/_query-dsl/geo-and-xy/geodistance.md
+++ b/_query-dsl/geo-and-xy/geodistance.md
@@ -103,11 +103,11 @@ The response contains the matching document:
}
```
-## Request fields
+## Parameters
-Geodistance queries accept the following fields.
+Geodistance queries accept the following parameters.
-Field | Data type | Description
+Parameter | Data type | Description
:--- | :--- | :---
`_name` | String | The name of the filter. Optional.
`distance` | String | The distance within which to match the points. This distance is the radius of a circle centered at the specified point. For supported distance units, see [Distance units]({{site.url}}{{site.baseurl}}/api-reference/common-parameters/#distance-units). Required.
diff --git a/_query-dsl/geo-and-xy/geopolygon.md b/_query-dsl/geo-and-xy/geopolygon.md
index 980a0c5a63..810e48f2b7 100644
--- a/_query-dsl/geo-and-xy/geopolygon.md
+++ b/_query-dsl/geo-and-xy/geopolygon.md
@@ -161,11 +161,11 @@ However, if you specify the vertices in the following order:
The response returns no results.
-## Request fields
+## Parameters
-Geopolygon queries accept the following fields.
+Geopolygon queries accept the following parameters.
-Field | Data type | Description
+Parameter | Data type | Description
:--- | :--- | :---
`_name` | String | The name of the filter. Optional.
`validation_method` | String | The validation method. Valid values are `IGNORE_MALFORMED` (accept geopoints with invalid coordinates), `COERCE` (try to coerce coordinates to valid values), and `STRICT` (return an error when coordinates are invalid). Optional. Default is `STRICT`.
diff --git a/_query-dsl/geo-and-xy/geoshape.md b/_query-dsl/geo-and-xy/geoshape.md
index 42948666f4..5b144b06d6 100644
--- a/_query-dsl/geo-and-xy/geoshape.md
+++ b/_query-dsl/geo-and-xy/geoshape.md
@@ -25,15 +25,15 @@ Relation | Description | Supporting geographic field type
## Defining the shape in a geoshape query
-You can define the shape to filter documents in a geoshape query either by providing a new shape definition at query time or by referencing the name of a shape pre-indexed in another index.
+You can define the shape to filter documents in a geoshape query either by [providing a new shape definition at query time](#using-a-new-shape-definition) or by [referencing the name of a shape pre-indexed in another index](#using-a-pre-indexed-shape-definition).
-### Using a new shape definition
+## Using a new shape definition
To provide a new shape to a geoshape query, define it in the `geo_shape` field. You must define the geoshape in [GeoJSON format](https://geojson.org/).
The following example illustrates searching for documents containing geoshapes that match a geoshape defined at query time.
-#### Step 1: Create an index
+### Step 1: Create an index
First, create an index and map the `location` field as a `geo_shape`:
@@ -422,7 +422,7 @@ GET /testindex/_search
Geoshape queries whose geometry collection contains a linestring or a multilinestring do not support the `WITHIN` relation.
{: .note}
-### Using a pre-indexed shape definition
+## Using a pre-indexed shape definition
When constructing a geoshape query, you can also reference the name of a shape pre-indexed in another index. Using this method, you can define a geoshape at index time and refer to it by name at search time.
@@ -721,10 +721,10 @@ The response returns document 1:
Note that when you indexed the geopoints, you specified their coordinates in `"latitude, longitude"` format. When you search for matching documents, the coordinate array is in `[longitude, latitude]` format. Thus, document 1 is returned in the results but document 2 is not.
-## Request fields
+## Parameters
-Geoshape queries accept the following fields.
+Geoshape queries accept the following parameters.
-Field | Data type | Description
+Parameter | Data type | Description
:--- | :--- | :---
`ignore_unmapped` | Boolean | Specifies whether to ignore an unmapped field. If set to `true`, then the query does not return any documents that contain an unmapped field. If set to `false`, then an exception is thrown when the field is unmapped. Optional. Default is `false`.
\ No newline at end of file
diff --git a/_query-dsl/joining/has-child.md b/_query-dsl/joining/has-child.md
new file mode 100644
index 0000000000..c7da5bf7a9
--- /dev/null
+++ b/_query-dsl/joining/has-child.md
@@ -0,0 +1,398 @@
+---
+layout: default
+title: Has child
+parent: Joining queries
+nav_order: 10
+---
+
+# Has child query
+
+The `has_child` query returns parent documents whose child documents match a specific query. You can establish parent/child relationships between documents in the same index by using a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field type.
+
+The `has_child` query is slower than other queries because of the join operation it performs. Performance decreases as the number of matching child documents pointing to different parent documents increases. Each `has_child` query in your search may significantly impact query performance. If you prioritize speed, avoid using this query or limit its usage as much as possible.
+{: .warning}
+
+## Example
+
+Before you can run a `has_child` query, your index must contain a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field in order to establish parent/child relationships. The index mapping request uses the following format:
+
+```json
+PUT /example_index
+{
+ "mappings": {
+ "properties": {
+ "relationship_field": {
+ "type": "join",
+ "relations": {
+ "parent_doc": "child_doc"
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+In this example, you'll configure an index that contains documents representing products and their brands.
+
+First, create the index and establish the parent/child relationship between `brand` and `product`:
+
+```json
+PUT testindex1
+{
+ "mappings": {
+ "properties": {
+ "product_to_brand": {
+ "type": "join",
+ "relations": {
+ "brand": "product"
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+Index two parent (brand) documents:
+
+```json
+PUT testindex1/_doc/1
+{
+ "name": "Luxury brand",
+ "product_to_brand" : "brand"
+}
+```
+{% include copy-curl.html %}
+
+```json
+PUT testindex1/_doc/2
+{
+ "name": "Economy brand",
+ "product_to_brand" : "brand"
+}
+```
+{% include copy-curl.html %}
+
+Index three child (product) documents:
+
+```json
+PUT testindex1/_doc/3?routing=1
+{
+ "name": "Mechanical watch",
+ "sales_count": 150,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "1"
+ }
+}
+```
+{% include copy-curl.html %}
+
+```json
+PUT testindex1/_doc/4?routing=2
+{
+ "name": "Electronic watch",
+ "sales_count": 300,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+}
+```
+{% include copy-curl.html %}
+
+```json
+PUT testindex1/_doc/5?routing=2
+{
+ "name": "Digital watch",
+ "sales_count": 100,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+}
+```
+{% include copy-curl.html %}
+
+To search for the parent of a child, use a `has_child` query. The following query returns parent documents (brands) that make watches:
+
+```json
+GET testindex1/_search
+{
+ "query" : {
+ "has_child": {
+ "type":"product",
+ "query": {
+ "match" : {
+ "name": "watch"
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response returns both brands:
+
+```json
+{
+ "took": 15,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 2,
+ "relation": "eq"
+ },
+ "max_score": 1,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "1",
+ "_score": 1,
+ "_source": {
+ "name": "Luxury brand",
+ "product_to_brand": "brand"
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "2",
+ "_score": 1,
+ "_source": {
+ "name": "Economy brand",
+ "product_to_brand": "brand"
+ }
+ }
+ ]
+ }
+}
+```
+
+## Retrieving inner hits
+
+To return child documents that matched the query, provide the `inner_hits` parameter:
+
+```json
+GET testindex1/_search
+{
+ "query" : {
+ "has_child": {
+ "type":"product",
+ "query": {
+ "match" : {
+ "name": "watch"
+ }
+ },
+ "inner_hits": {}
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response contains child documents in the `inner_hits` field:
+
+```json
+{
+ "took": 52,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 2,
+ "relation": "eq"
+ },
+ "max_score": 1,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "1",
+ "_score": 1,
+ "_source": {
+ "name": "Luxury brand",
+ "product_to_brand": "brand"
+ },
+ "inner_hits": {
+ "product": {
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 0.53899646,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "3",
+ "_score": 0.53899646,
+ "_routing": "1",
+ "_source": {
+ "name": "Mechanical watch",
+ "sales_count": 150,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "1"
+ }
+ }
+ }
+ ]
+ }
+ }
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "2",
+ "_score": 1,
+ "_source": {
+ "name": "Economy brand",
+ "product_to_brand": "brand"
+ },
+ "inner_hits": {
+ "product": {
+ "hits": {
+ "total": {
+ "value": 2,
+ "relation": "eq"
+ },
+ "max_score": 0.53899646,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "4",
+ "_score": 0.53899646,
+ "_routing": "2",
+ "_source": {
+ "name": "Electronic watch",
+ "sales_count": 300,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "5",
+ "_score": 0.53899646,
+ "_routing": "2",
+ "_source": {
+ "name": "Digital watch",
+ "sales_count": 100,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ }
+ }
+ ]
+ }
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+For more information about retrieving inner hits, see [Inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/).
+
+## Parameters
+
+The following table lists all top-level parameters supported by `has_child` queries.
+
+| Parameter | Required/Optional | Description |
+|:---|:---|:---|
+| `type` | Required | Specifies the name of the child relationship as defined in the `join` field mapping. |
+| `query` | Required | The query to run on child documents. If a child document matches the query, the parent document is returned. |
+| `ignore_unmapped` | Optional | Indicates whether to ignore an unmapped `type`. If `true`, the query returns no documents instead of throwing an error. Provide this parameter when querying multiple indexes, some of which may not contain the `type` field. Default is `false`. |
+| `max_children` | Optional | The maximum number of matching child documents for a parent document. If exceeded, the parent document is excluded from the search results. |
+| `min_children` | Optional | The minimum number of matching child documents required for a parent document to be included in the results. If not met, the parent is excluded. Default is `1`.|
+| `score_mode` | Optional | Defines how the scores of matching child documents influence the parent document's score. Valid values are: <br> - `none`: Ignores the relevance scores of child documents and assigns a score of `0` to the parent document. <br> - `avg`: Uses the average relevance score of all matching child documents. <br> - `max`: Assigns the highest relevance score from the matching child documents to the parent. <br> - `min`: Assigns the lowest relevance score from the matching child documents to the parent. <br> - `sum`: Sums the relevance scores of all matching child documents. <br> Default is `none`. |
+| `inner_hits` | Optional | If provided, returns the underlying hits (child documents) that matched the query. |
+
+
+## Sorting limitations
+
+The `has_child` query does not support [sorting results]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/sort/) using standard sorting options. If you need to sort parent documents by fields in their child documents, you can use a [`function_score` query]({{site.url}}{{site.baseurl}}/query-dsl/compound/function-score/) and sort by the parent document's score.
+
+In the preceding example, you can sort parent documents (brands) based on the `sales_count` of their child products. This query multiplies the score by the `sales_count` field of the child documents and assigns the highest relevance score from the matching child documents to the parent:
+
+```json
+GET testindex1/_search
+{
+ "query": {
+ "has_child": {
+ "type": "product",
+ "query": {
+ "function_score": {
+ "script_score": {
+ "script": "_score * doc['sales_count'].value"
+ }
+ }
+ },
+ "score_mode": "max"
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response contains the brands sorted by the highest child `sales_count`:
+
+```json
+{
+ "took": 6,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 2,
+ "relation": "eq"
+ },
+ "max_score": 300,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "2",
+ "_score": 300,
+ "_source": {
+ "name": "Economy brand",
+ "product_to_brand": "brand"
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "1",
+ "_score": 150,
+ "_source": {
+ "name": "Luxury brand",
+ "product_to_brand": "brand"
+ }
+ }
+ ]
+ }
+}
+```
+
+## Next steps
+
+- Learn more about [retrieving inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/).
\ No newline at end of file
diff --git a/_query-dsl/joining/has-parent.md b/_query-dsl/joining/has-parent.md
new file mode 100644
index 0000000000..6b293ffff2
--- /dev/null
+++ b/_query-dsl/joining/has-parent.md
@@ -0,0 +1,358 @@
+---
+layout: default
+title: Has parent
+parent: Joining queries
+nav_order: 20
+---
+
+# Has parent query
+
+The `has_parent` query returns child documents whose parent documents match a specific query. You can establish parent/child relationships between documents in the same index by using a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field type.
+
+The `has_parent` query is slower than other queries because of the join operation it performs. Performance decreases as the number of matching parent documents increases. Each `has_parent` query in your search may significantly impact query performance. If you prioritize speed, avoid using this query or limit its usage as much as possible.
+{: .warning}
+
+## Example
+
+Before you can run a `has_parent` query, your index must contain a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field in order to establish parent/child relationships. The index mapping request uses the following format:
+
+```json
+PUT /example_index
+{
+ "mappings": {
+ "properties": {
+ "relationship_field": {
+ "type": "join",
+ "relations": {
+ "parent_doc": "child_doc"
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+For this example, first configure an index that contains documents representing products and their brands as described in the [`has_child` query example]({{site.url}}{{site.baseurl}}/query-dsl/joining/has-child/).
+
+To search for the child of a parent, use a `has_parent` query. The following query returns child documents (products) made by the brand matching the query `economy`:
+
+```json
+GET testindex1/_search
+{
+ "query" : {
+ "has_parent": {
+ "parent_type":"brand",
+ "query": {
+ "match" : {
+ "name": "economy"
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response returns all products made by the brand:
+
+```json
+{
+ "took": 11,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 2,
+ "relation": "eq"
+ },
+ "max_score": 1,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "4",
+ "_score": 1,
+ "_routing": "2",
+ "_source": {
+ "name": "Electronic watch",
+ "sales_count": 300,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "5",
+ "_score": 1,
+ "_routing": "2",
+ "_source": {
+ "name": "Digital watch",
+ "sales_count": 100,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+## Retrieving inner hits
+
+To return parent documents that matched the query, provide the `inner_hits` parameter:
+
+```json
+GET testindex1/_search
+{
+ "query" : {
+ "has_parent": {
+ "parent_type":"brand",
+ "query": {
+ "match" : {
+ "name": "economy"
+ }
+ },
+ "inner_hits": {}
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response contains parent documents in the `inner_hits` field:
+
+```json
+{
+ "took": 11,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 2,
+ "relation": "eq"
+ },
+ "max_score": 1,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "4",
+ "_score": 1,
+ "_routing": "2",
+ "_source": {
+ "name": "Electronic watch",
+ "sales_count": 300,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ },
+ "inner_hits": {
+ "brand": {
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 1.3862942,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "2",
+ "_score": 1.3862942,
+ "_source": {
+ "name": "Economy brand",
+ "product_to_brand": "brand"
+ }
+ }
+ ]
+ }
+ }
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "5",
+ "_score": 1,
+ "_routing": "2",
+ "_source": {
+ "name": "Digital watch",
+ "sales_count": 100,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ },
+ "inner_hits": {
+ "brand": {
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 1.3862942,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "2",
+ "_score": 1.3862942,
+ "_source": {
+ "name": "Economy brand",
+ "product_to_brand": "brand"
+ }
+ }
+ ]
+ }
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+For more information about retrieving inner hits, see [Inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/).
+
+## Parameters
+
+The following table lists all top-level parameters supported by `has_parent` queries.
+
+| Parameter | Required/Optional | Description |
+|:---|:---|:---|
+| `parent_type` | Required | Specifies the name of the parent relationship as defined in the `join` field mapping. |
+| `query` | Required | The query to run on parent documents. If a parent document matches the query, the child document is returned. |
+| `ignore_unmapped` | Optional | Indicates whether to ignore an unmapped `parent_type`. If `true`, the query returns no documents instead of throwing an error. Provide this parameter when querying multiple indexes, some of which may not contain the `parent_type` field. Default is `false`. |
+| `score` | Optional | Indicates whether the relevance score of a matching parent document is aggregated into its child documents. If `false`, then the relevance score of the parent document is ignored, and each child document is assigned a relevance score equal to the query's `boost`, which defaults to `1`. If `true`, then the relevance score of the matching parent document is aggregated into the relevance scores of its child documents. Default is `false`. |
+| `inner_hits` | Optional | If provided, returns the underlying hits (parent documents) that matched the query. |
+
+
+## Sorting limitations
+
+The `has_parent` query does not support [sorting results]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/sort/) using standard sorting options. If you need to sort child documents by fields in their parent documents, you can use a [`function_score` query]({{site.url}}{{site.baseurl}}/query-dsl/compound/function-score/) and sort by the child document's score.
+
+For the preceding example, first add a `customer_satisfaction` field by which you'll sort the child documents belonging to the parent (brand) documents:
+
+```json
+PUT testindex1/_doc/1
+{
+ "name": "Luxury watch brand",
+ "product_to_brand" : "brand",
+ "customer_satisfaction": 4.5
+}
+```
+{% include copy-curl.html %}
+
+```json
+PUT testindex1/_doc/2
+{
+ "name": "Economy watch brand",
+ "product_to_brand" : "brand",
+ "customer_satisfaction": 3.9
+}
+```
+{% include copy-curl.html %}
+
+Now you can sort child documents (products) based on the `customer_satisfaction` field of their parent brands. This query multiplies the score by the `customer_satisfaction` field of the parent documents:
+
+```json
+GET testindex1/_search
+{
+ "query": {
+ "has_parent": {
+ "parent_type": "brand",
+ "score": true,
+ "query": {
+ "function_score": {
+ "script_score": {
+ "script": "_score * doc['customer_satisfaction'].value"
+ }
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response contains the products, sorted by the highest parent `customer_satisfaction`:
+
+```json
+{
+ "took": 11,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 3,
+ "relation": "eq"
+ },
+ "max_score": 4.5,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "3",
+ "_score": 4.5,
+ "_routing": "1",
+ "_source": {
+ "name": "Mechanical watch",
+ "sales_count": 150,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "1"
+ }
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "4",
+ "_score": 3.9,
+ "_routing": "2",
+ "_source": {
+ "name": "Electronic watch",
+ "sales_count": 300,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "5",
+ "_score": 3.9,
+ "_routing": "2",
+ "_source": {
+ "name": "Digital watch",
+ "sales_count": 100,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+## Next steps
+
+- Learn more about [retrieving inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/).
\ No newline at end of file
diff --git a/_query-dsl/joining/index.md b/_query-dsl/joining/index.md
index 20f48c0b16..f0a0060640 100644
--- a/_query-dsl/joining/index.md
+++ b/_query-dsl/joining/index.md
@@ -3,16 +3,22 @@ layout: default
title: Joining queries
has_children: true
nav_order: 55
+has_toc: false
+redirect_from:
+ - /query-dsl/joining/
---
# Joining queries
OpenSearch is a distributed system in which data is spread across multiple nodes. Thus, running a SQL-like JOIN operation in OpenSearch is resource intensive. As an alternative, OpenSearch provides the following queries that perform join operations and are optimized for scaling across multiple nodes:
-- `nested` queries: Act as wrappers for other queries to search [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) fields. The nested field objects are searched as though they were indexed as separate documents.
-- `has_child` queries: Search for parent documents whose child documents match the query.
-- `has_parent` queries: Search for child documents whose parent documents match the query.
-- `parent_id` queries: A [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) field type establishes a parent/child relationship between documents in the same index. `parent_id` queries search for child documents that are joined to a specific parent document.
+
+- Queries for searching [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) fields:
+ - `nested` queries: Act as wrappers for other queries to search [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) fields. The nested field objects are searched as though they were indexed as separate documents.
+- Queries for searching documents connected by a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field type, which establishes a parent/child relationship between documents in the same index:
+ - [`has_child`]({{site.url}}{{site.baseurl}}/query-dsl/joining/has-child/) queries: Search for parent documents whose child documents match the query.
+ - [`has_parent`]({{site.url}}{{site.baseurl}}/query-dsl/joining/has-parent/) queries: Search for child documents whose parent documents match the query.
+ - [`parent_id`]({{site.url}}{{site.baseurl}}/query-dsl/joining/parent-id/) queries: Search for child documents that are joined to a specific parent document.
If [`search.allow_expensive_queries`]({{site.url}}{{site.baseurl}}/query-dsl/index/#expensive-queries) is set to `false`, then joining queries are not executed.
{: .important}
\ No newline at end of file
diff --git a/_query-dsl/joining/nested.md b/_query-dsl/joining/nested.md
new file mode 100644
index 0000000000..431a40ed1a
--- /dev/null
+++ b/_query-dsl/joining/nested.md
@@ -0,0 +1,347 @@
+---
+layout: default
+title: Nested
+parent: Joining queries
+nav_order: 30
+---
+
+# Nested query
+
+The `nested` query acts as a wrapper for other queries to search [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) fields. The nested field objects are searched as though they were indexed as separate documents. If an object matches the search, the `nested` query returns the parent document at the root level.
+
+## Example
+
+Before you can run a `nested` query, your index must contain a [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) field.
+
+To configure an example index containing nested fields, send the following request:
+
+```json
+PUT /testindex
+{
+ "mappings": {
+ "properties": {
+ "patient": {
+ "type": "nested",
+ "properties": {
+ "name": {
+ "type": "text"
+ },
+ "age": {
+ "type": "integer"
+ }
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+Next, index a document into the example index:
+
+```json
+PUT /testindex/_doc/1
+{
+ "patient": {
+ "name": "John Doe",
+ "age": 56
+ }
+}
+```
+{% include copy-curl.html %}
+
+To search the nested `patient` field, wrap your query in a `nested` query and provide the `path` to the nested field:
+
+```json
+GET /testindex/_search
+{
+ "query": {
+ "nested": {
+ "path": "patient",
+ "query": {
+ "match": {
+ "patient.name": "John"
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The query returns the matching document:
+
+```json
+{
+ "took": 3,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 0.2876821,
+ "hits": [
+ {
+ "_index": "testindex",
+ "_id": "1",
+ "_score": 0.2876821,
+ "_source": {
+ "patient": {
+ "name": "John Doe",
+ "age": 56
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+## Retrieving inner hits
+
+To return inner hits that matched the query, provide the `inner_hits` parameter:
+
+```json
+GET /testindex/_search
+{
+ "query": {
+ "nested": {
+ "path": "patient",
+ "query": {
+ "match": {
+ "patient.name": "John"
+ }
+ },
+ "inner_hits": {}
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response contains the additional `inner_hits` field. The `_nested` field identifies the specific inner object from which the inner hit originated. It contains the nested hit and the offset relative to its position in the `_source`. Because of sorting and scoring, the position of the hit objects in `inner_hits` often differs from their original location in the nested object.
+
+By default, the `_source` of the hit objects within `inner_hits` is returned relative to the `_nested` field. In this example, the `_source` within `inner_hits` contains the `name` and `age` fields as opposed to the top-level `_source`, which contains the whole `patient` object:
+
+```json
+{
+ "took": 38,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 0.2876821,
+ "hits": [
+ {
+ "_index": "testindex",
+ "_id": "1",
+ "_score": 0.2876821,
+ "_source": {
+ "patient": {
+ "name": "John Doe",
+ "age": 56
+ }
+ },
+ "inner_hits": {
+ "patient": {
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 0.2876821,
+ "hits": [
+ {
+ "_index": "testindex",
+ "_id": "1",
+ "_nested": {
+ "field": "patient",
+ "offset": 0
+ },
+ "_score": 0.2876821,
+ "_source": {
+ "name": "John Doe",
+ "age": 56
+ }
+ }
+ ]
+ }
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+You can disable returning `_source` by configuring the `_source` field in the mappings. For more information, see [Source]({{site.url}}{{site.baseurl}}/field-types/metadata-fields/source/).
+{: .tip}
+
+For more information about retrieving inner hits, see [Inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/).
+
+## Multi-level nested queries
+
+You can search documents that have nested objects inside other nested objects using multi-level nested queries. In this example, you'll query multiple layers of nested fields by specifying a nested query for each level of the hierarchy.
+
+First, create an index with multi-level nested fields:
+
+```json
+PUT /patients
+{
+ "mappings": {
+ "properties": {
+ "patient": {
+ "type": "nested",
+ "properties": {
+ "name": {
+ "type": "text"
+ },
+ "contacts": {
+ "type": "nested",
+ "properties": {
+ "name": {
+ "type": "text"
+ },
+ "relationship": {
+ "type": "text"
+ },
+ "phone": {
+ "type": "keyword"
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+Next, index a document into the example index:
+
+```json
+PUT /patients/_doc/1
+{
+ "patient": {
+ "name": "John Doe",
+ "contacts": [
+ {
+ "name": "Jane Doe",
+ "relationship": "mother",
+ "phone": "5551111"
+ },
+ {
+ "name": "Joe Doe",
+ "relationship": "father",
+ "phone": "5552222"
+ }
+ ]
+ }
+}
+```
+{% include copy-curl.html %}
+
+To search the nested `patient` field, use a multi-level `nested` query. The following query searches for patients whose contact information includes a person named `Jane` with a relationship of `mother`:
+
+```json
+GET /patients/_search
+{
+ "query": {
+ "nested": {
+ "path": "patient",
+ "query": {
+ "nested": {
+ "path": "patient.contacts",
+ "query": {
+ "bool": {
+ "must": [
+ { "match": { "patient.contacts.relationship": "mother" } },
+ { "match": { "patient.contacts.name": "Jane" } }
+ ]
+ }
+ }
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The query returns the patient who has a contact entry matching these details:
+
+```json
+{
+ "took": 14,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 1.3862942,
+ "hits": [
+ {
+ "_index": "patients",
+ "_id": "1",
+ "_score": 1.3862942,
+ "_source": {
+ "patient": {
+ "name": "John Doe",
+ "contacts": [
+ {
+ "name": "Jane Doe",
+ "relationship": "mother",
+ "phone": "5551111"
+ },
+ {
+ "name": "Joe Doe",
+ "relationship": "father",
+ "phone": "5552222"
+ }
+ ]
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+## Parameters
+
+The following table lists all top-level parameters supported by `nested` queries.
+
+| Parameter | Required/Optional | Description |
+|:---|:---|:---|
+| `path` | Required | Specifies the path to the nested object that you want to search. |
+| `query` | Required | The query to run on the nested objects within the specified `path`. If a nested object matches the query, the root parent document is returned. You can search nested fields using dot notation, such as `nested_object.subfield`. Multi-level nesting is supported and automatically detected. Thus, an inner `nested` query within another nested query automatically matches the correct nesting level, instead of the root. |
+| `ignore_unmapped` | Optional | Indicates whether to ignore unmapped `path` fields and not return documents instead of throwing an error. You can provide this parameter when querying multiple indexes, some of which may not contain the `path` field. Default is `false`. |
+| `score_mode` | Optional | Defines how scores of matching inner documents influence the parent document's score. Valid values are: <br> - `avg`: Uses the average relevance score of all matching inner documents. <br> - `max`: Assigns the highest relevance score from the matching inner documents to the parent. <br> - `min`: Assigns the lowest relevance score from the matching inner documents to the parent. <br> - `sum`: Sums the relevance scores of all matching inner documents. <br> - `none`: Ignores the relevance scores of inner documents and assigns a score of `0` to the parent document. <br> Default is `avg`. |
+| `inner_hits` | Optional | If provided, returns the underlying hits that matched the query. |
+
+## Next steps
+
+- Learn more about [retrieving inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/).
\ No newline at end of file
diff --git a/_query-dsl/joining/parent-id.md b/_query-dsl/joining/parent-id.md
new file mode 100644
index 0000000000..cbf86a796e
--- /dev/null
+++ b/_query-dsl/joining/parent-id.md
@@ -0,0 +1,96 @@
+---
+layout: default
+title: Parent ID
+parent: Joining queries
+nav_order: 40
+---
+
+# Parent ID query
+
+The `parent_id` query returns child documents whose parent document has the specified ID. You can establish parent/child relationships between documents in the same index by using a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field type.
+
+## Example
+
+Before you can run a `parent_id` query, your index must contain a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field in order to establish parent/child relationships. The index mapping request uses the following format:
+
+```json
+PUT /example_index
+{
+ "mappings": {
+ "properties": {
+ "relationship_field": {
+ "type": "join",
+ "relations": {
+ "parent_doc": "child_doc"
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+For this example, first configure an index that contains documents representing products and their brands as described in the [`has_child` query example]({{site.url}}{{site.baseurl}}/query-dsl/joining/has-child/).
+
+To search for child documents of a specific parent document, use a `parent_id` query. The following query returns child documents (products) whose parent document has the ID `1`:
+
+```json
+GET testindex1/_search
+{
+ "query": {
+ "parent_id": {
+ "type": "product",
+ "id": "1"
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response returns the child product:
+
+```json
+{
+ "took": 57,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 0.87546873,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "3",
+ "_score": 0.87546873,
+ "_routing": "1",
+ "_source": {
+ "name": "Mechanical watch",
+ "sales_count": 150,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "1"
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+## Parameters
+
+The following table lists all top-level parameters supported by `parent_id` queries.
+
+| Parameter | Required/Optional | Description |
+|:---|:---|:---|
+| `type` | Required | Specifies the name of the child relationship as defined in the `join` field mapping. |
+| `id` | Required | The ID of the parent document. The query returns child documents associated with this parent document. |
+| `ignore_unmapped` | Optional | Indicates whether to ignore unmapped `type` fields and not return documents instead of throwing an error. You can provide this parameter when querying multiple indexes, some of which may not contain the `type` field. Default is `false`. |
\ No newline at end of file
diff --git a/_query-dsl/specialized/neural.md b/_query-dsl/specialized/neural.md
index 14b930cdb6..6cd534b87f 100644
--- a/_query-dsl/specialized/neural.md
+++ b/_query-dsl/specialized/neural.md
@@ -35,6 +35,8 @@ Field | Data type | Required/Optional | Description
`min_score` | Float | Optional | The minimum score threshold for the search results. Only one variable, either `k`, `min_score`, or `max_distance`, can be specified. For more information, see [k-NN radial search]({{site.url}}{{site.baseurl}}/search-plugins/knn/radial-search-knn/).
`max_distance` | Float | Optional | The maximum distance threshold for the search results. Only one variable, either `k`, `min_score`, or `max_distance`, can be specified. For more information, see [k-NN radial search]({{site.url}}{{site.baseurl}}/search-plugins/knn/radial-search-knn/).
`filter` | Object | Optional | A query that can be used to reduce the number of documents considered. For more information about filter usage, see [k-NN search with filters]({{site.url}}{{site.baseurl}}/search-plugins/knn/filter-search-knn/). **Important**: Filter can only be used with the `faiss` or `lucene` engines.
+`method_parameters` | Object | Optional | Parameters passed to the k-NN index during search. See [Additional query parameters]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#additional-query-parameters).
+`rescore` | Object | Optional | Parameters for configuring rescoring functionality for k-NN indexes built using quantization. See [Rescoring]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#rescoring-quantized-results-using-full-precision).
#### Example request
diff --git a/_query-dsl/term/terms.md b/_query-dsl/term/terms.md
index 42c74c0436..7dac6a9619 100644
--- a/_query-dsl/term/terms.md
+++ b/_query-dsl/term/terms.md
@@ -39,6 +39,7 @@ Parameter | Data type | Description
:--- | :--- | :---
`` | String | The field in which to search. A document is returned in the results only if its field value exactly matches at least one term, with the correct spacing and capitalization.
`boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0.
+`value_type` | String | Specifies the types of values used for filtering. Valid values are `default` and `bitmap`. If omitted, the value defaults to `default`.
## Terms lookup
@@ -250,3 +251,136 @@ Parameter | Data type | Description
`path` | String | The name of the field from which to fetch field values. Specify nested fields using dot path notation. Required.
`routing` | String | Custom routing value of the document from which to fetch field values. Optional. Required if a custom routing value was provided when the document was indexed.
`boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0.
+
+## Bitmap filtering
+**Introduced 2.17**
+{: .label .label-purple }
+
+The `terms` query can filter for multiple terms simultaneously. However, when the number of terms in the input filter increases to a large value (around 10,000), the resulting network and memory overhead can become significant, making the query inefficient. In such cases, consider encoding your large terms filter using a [roaring bitmap](https://github.com/RoaringBitmap/RoaringBitmap) for more efficient filtering.
+
+The following example assumes that you have two indexes: a `products` index, which contains all the products sold by a company, and a `customers` index, which stores filters representing customers who own specific products.
+
+First, create a `products` index and map `product_id` as a `keyword`:
+
+```json
+PUT /products
+{
+ "mappings": {
+ "properties": {
+ "product_id": { "type": "keyword" }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+Next, index three documents that correspond to products:
+
+```json
+PUT products/_doc/1
+{
+ "name": "Product 1",
+ "product_id" : "111"
+}
+```
+{% include copy-curl.html %}
+
+```json
+PUT products/_doc/2
+{
+ "name": "Product 2",
+ "product_id" : "222"
+}
+```
+{% include copy-curl.html %}
+
+```json
+PUT products/_doc/3
+{
+ "name": "Product 3",
+ "product_id" : "333"
+}
+```
+{% include copy-curl.html %}
+
+To store customer bitmap filters, you'll create a `customer_filter` [binary field]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/binary/) in the `customers` index. Specify `store` as `true` to store the field:
+
+```json
+PUT /customers
+{
+ "mappings": {
+ "properties": {
+ "customer_filter": {
+ "type": "binary",
+ "store": true
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+For each customer, you need to generate a bitmap that represents the product IDs of the products the customer owns. This bitmap effectively encodes the filter criteria for that customer. In this example, you'll create a `terms` filter for a customer whose ID is `customer123` and who owns products `111`, `222`, and `333`.
+
+To encode a `terms` filter for the customer, first create a roaring bitmap for the filter. This example creates a bitmap using the [PyRoaringBitMap](https://github.com/Ezibenroc/PyRoaringBitMap) library, so first run `pip install pyroaring` to install the library. Then serialize the bitmap and encode it using a [Base64](https://en.wikipedia.org/wiki/Base64) encoding scheme:
+
+```py
+from pyroaring import BitMap
+import base64
+
+# Create a bitmap, serialize it into a byte string, and encode into Base64
+bm = BitMap([111, 222, 333]) # product ids owned by a customer
+encoded = base64.b64encode(BitMap.serialize(bm))
+
+# Convert the Base64-encoded bytes to a string for storage or transmission
+encoded_bm_str = encoded.decode('utf-8')
+
+# Print the encoded bitmap
+print(f"Encoded Bitmap: {encoded_bm_str}")
+```
+{% include copy.html %}
+
+Next, index the customer filter into the `customers` index. The document ID for the filter is the same as the ID for the corresponding customer (in this example, `customer123`). The `customer_filter` field contains the bitmap you generated for this customer:
+
+```json
+POST customers/_doc/customer123
+{
+ "customer_filter": "OjAAAAEAAAAAAAIAEAAAAG8A3gBNAQ=="
+}
+```
+{% include copy-curl.html %}
+
+Now you can run a `terms` query on the `products` index to look up a specific customer in the `customers` index. Because you're looking up a stored field instead of `_source`, set `store` to `true`. In the `value_type` field, specify the data type of the `terms` input as `bitmap`:
+
+```json
+POST /products/_search
+{
+ "query": {
+ "terms": {
+ "product_id": {
+ "index": "customers",
+ "id": "customer123",
+ "path": "customer_filter",
+ "store": true
+ },
+ "value_type": "bitmap"
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+You can also directly pass the bitmap to the `terms` query. In this example, the `product_id` field contains the customer filter bitmap for the customer whose ID is `customer123`:
+
+```json
+POST /products/_search
+{
+ "query": {
+ "terms": {
+ "product_id": "OjAAAAEAAAAAAAIAEAAAAG8A3gBNAQ==",
+ "value_type": "bitmap"
+ }
+ }
+}
+```
+{% include copy-curl.html %}
\ No newline at end of file
diff --git a/_sass/color_schemes/odfe.scss b/_sass/color_schemes/odfe.scss
deleted file mode 100644
index f9b2ca02ba..0000000000
--- a/_sass/color_schemes/odfe.scss
+++ /dev/null
@@ -1,75 +0,0 @@
-//
-// Brand colors
-//
-
-$white: #FFFFFF;
-
-$grey-dk-300: #241F21; // Error
-$grey-dk-250: mix(white, $grey-dk-300, 12.5%);
-$grey-dk-200: mix(white, $grey-dk-300, 25%);
-$grey-dk-100: mix(white, $grey-dk-300, 50%);
-$grey-dk-000: mix(white, $grey-dk-300, 75%);
-
-$grey-lt-300: #DBDBDB; // Cloud
-$grey-lt-200: mix(white, $grey-lt-300, 25%);
-$grey-lt-100: mix(white, $grey-lt-300, 50%);
-$grey-lt-000: mix(white, $grey-lt-300, 75%);
-
-$blue-300: #00007C; // Meta
-$blue-200: mix(white, $blue-300, 25%);
-$blue-100: mix(white, $blue-300, 50%);
-$blue-000: mix(white, $blue-300, 75%);
-
-$purple-300: #9600FF; // Prpl
-$purple-200: mix(white, $purple-300, 25%);
-$purple-100: mix(white, $purple-300, 50%);
-$purple-000: mix(white, $purple-300, 75%);
-
-$green-300: #00671A; // Element
-$green-200: mix(white, $green-300, 25%);
-$green-100: mix(white, $green-300, 50%);
-$green-000: mix(white, $green-300, 75%);
-
-$yellow-300: #FFDF00; // Kan-Banana
-$yellow-200: mix(white, $yellow-300, 25%);
-$yellow-100: mix(white, $yellow-300, 50%);
-$yellow-000: mix(white, $yellow-300, 75%);
-
-$red-300: #BD145A; // Ruby
-$red-200: mix(white, $red-300, 25%);
-$red-100: mix(white, $red-300, 50%);
-$red-000: mix(white, $red-300, 75%);
-
-$blue-lt-300: #0000FF; // Cascade
-$blue-lt-200: mix(white, $blue-lt-300, 25%);
-$blue-lt-100: mix(white, $blue-lt-300, 50%);
-$blue-lt-000: mix(white, $blue-lt-300, 75%);
-
-/*
-Other, unused brand colors
-
-Float #2797F4
-Firewall #0FF006B
-Hyper Pink #F261A1
-Cluster #ED20EB
-Back End #808080
-Python #25EE5C
-Warm Node #FEA501
-*/
-
-$body-background-color: $white;
-$sidebar-color: $grey-lt-000;
-$code-background-color: $grey-lt-000;
-
-$body-text-color: $grey-dk-200;
-$body-heading-color: $grey-dk-300;
-$nav-child-link-color: $grey-dk-200;
-$link-color: mix(black, $blue-lt-300, 37.5%);
-$btn-primary-color: $purple-300;
-$base-button-color: $grey-lt-000;
-
-// $border-color: $grey-dk-200;
-// $search-result-preview-color: $grey-dk-000;
-// $search-background-color: $grey-dk-250;
-// $table-background-color: $grey-dk-250;
-// $feedback-color: darken($sidebar-color, 3%);
diff --git a/_sass/custom/custom.scss b/_sass/custom/custom.scss
index 3a9dcc5e6d..b3ee3c3775 100755
--- a/_sass/custom/custom.scss
+++ b/_sass/custom/custom.scss
@@ -1039,14 +1039,25 @@ body {
display: flex;
align-items: flex-start;
justify-content: center;
- gap: 20px;
- margin: 0 auto;
+ gap: 0;
+ border-top: 1px solid #eeebee;
+ flex-direction: column;
+ @include mq(md) {
+ flex-direction: row;
+ gap: 20px
+ }
}
.search-page--sidebar {
- flex: 1;
- max-width: 200px;
- flex: 0 0 200px;
+ max-width: 100%;
+ order: 2;
+ margin-top: 1rem;
+ color: $grey-dk-300;
+ @include mq(md) {
+ flex: 1;
+ max-width: 200px;
+ margin-top: 3rem;
+ }
}
.search-page--sidebar--category-filter--checkbox-child {
@@ -1054,52 +1065,96 @@ body {
}
.search-page--results {
- flex: 3;
display: flex;
flex-direction: column;
align-items: center;
- max-width: 60%;
+ width: 100%;
+ max-width: 100%;
+ order: 3;
+ @include mq(md) {
+ flex: 3;
+ max-width: 60%;
+ }
}
-.search-page--results--input {
- width: 100%;
+.search-page--results--wrapper {
position: relative;
+ display: flex;
+ width: 100%;
+ background-color: white;
+ margin: 0 auto 2rem;
+ max-width: 800px;
}
.search-page--results--input-box {
width: 100%;
- padding: 10px;
- margin-bottom: 20px;
- border: 1px solid #ccc;
+ padding: 10px 40px 10px 10px;
+ border: 1px solid $grey-lt-300;
border-radius: 4px;
+ color: $grey-dk-300;
}
.search-page--results--input-icon {
position: absolute;
- top: 35%;
- right: 10px;
- transform: translateY(-50%);
+ right: 12px;
+ align-self: center;
pointer-events: none;
- color: #333;
+ color: $grey-dk-000;
}
-.search-page--results--diplay {
+.search-page--results--display {
width: 100%;
position: relative;
flex-flow: column nowrap;
+ margin-top: 1rem;
+ @media (max-width: $content-width) {
+ margin-top: 0.5rem;
+ }
}
-.search-page--results--diplay--header {
+.search-page--results--display--header {
text-align: center;
- margin-bottom: 20px;
background-color: transparent;
+ color: $grey-dk-300;
+ margin-bottom: 1rem;
+ margin-top: 1.5rem;
+ padding-bottom: 1rem;
+ border-bottom: 1px solid $blue-dk-100;
+ font-size: 20px;
+ @include mq(md) {
+ font-size: 1.5rem;
+ }
}
-.search-page--results--diplay--container--item {
- margin-bottom: 1%;
+.search-page--results--display--container--item {
+ margin-bottom: 2rem;
display: block;
}
+.search-page--results--no-results {
+ padding: 1rem;
+ display: block;
+ font-size: 1rem;
+ font-weight: normal;
+}
+
+.search-page--results--display--container--item--link {
+ font-family: "Open Sans Condensed", Impact, "Franklin Gothic Bold", sans-serif;
+ font-size: 1.2rem;
+ font-weight: bold;
+ display: block;
+ text-decoration: underline;
+ text-underline-offset: 5px;
+ text-decoration-color: $grey-lt-300;
+ &:hover {
+ text-decoration-color: $blue-100;
+ }
+}
+
+.category-checkbox {
+ margin-right: 4px;
+}
+
@mixin body-text($color: #000) {
color: $color;
font-family: 'Open Sans';
diff --git a/_search-plugins/collapse-search.md b/_search-plugins/collapse-search.md
new file mode 100644
index 0000000000..ec7e57515a
--- /dev/null
+++ b/_search-plugins/collapse-search.md
@@ -0,0 +1,231 @@
+---
+layout: default
+title: Collapse search results
+nav_order: 3
+---
+
+# Collapse search results
+
+The `collapse` parameter groups search results by a particular field value. This returns only the top document within each group, which helps reduce redundancy by eliminating duplicates.
+
+The `collapse` parameter requires the field being collapsed to be of either a `keyword` or a `numeric` type.
+
+---
+
+## Collapsing search results
+
+To populate an index with data, define the index mappings and an `item` field indexed as a `keyword`. The following example request shows you how to define index mappings, populate an index, and then search it.
+
+#### Define index mappings
+
+```json
+PUT /bakery-items
+{
+ "mappings": {
+ "properties": {
+ "item": {
+ "type": "keyword"
+ },
+ "category": {
+ "type": "keyword"
+ },
+ "price": {
+ "type": "float"
+ },
+ "baked_date": {
+ "type": "date"
+ }
+ }
+ }
+}
+```
+
+#### Populate an index
+
+```json
+POST /bakery-items/_bulk
+{ "index": {} }
+{ "item": "Chocolate Cake", "category": "cakes", "price": 15, "baked_date": "2023-07-01T00:00:00Z" }
+{ "index": {} }
+{ "item": "Chocolate Cake", "category": "cakes", "price": 18, "baked_date": "2023-07-04T00:00:00Z" }
+{ "index": {} }
+{ "item": "Vanilla Cake", "category": "cakes", "price": 12, "baked_date": "2023-07-02T00:00:00Z" }
+```
+
+#### Search the index, returning all results
+
+```json
+GET /bakery-items/_search
+{
+ "query": {
+ "match": {
+ "category": "cakes"
+ }
+ },
+ "sort": ["price"]
+}
+```
+
+This query returns the uncollapsed search results, showing all documents, including both entries for "Chocolate Cake".
+
+#### Search the index and collapse the results
+
+To group search results by the `item` field and sort them by `price`, you can use the following query:
+
+**Collapsed `item` field search results**
+
+```json
+GET /bakery-items/_search
+{
+ "query": {
+ "match": {
+ "category": "cakes"
+ }
+ },
+ "collapse": {
+ "field": "item"
+ },
+ "sort": ["price"]
+}
+```
+
+**Response**
+
+```json
+{
+ "took": 3,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 4,
+ "relation": "eq"
+ },
+ "max_score": null,
+ "hits": [
+ {
+ "_index": "bakery-items",
+ "_id": "mISga5EB2HLDXHkv9kAr",
+ "_score": null,
+ "_source": {
+ "item": "Vanilla Cake",
+ "category": "cakes",
+ "price": 12,
+ "baked_date": "2023-07-02T00:00:00Z",
+ "baker": "Baker A"
+ },
+ "fields": {
+ "item": [
+ "Vanilla Cake"
+ ]
+ },
+ "sort": [
+ 12
+ ]
+ },
+ {
+ "_index": "bakery-items",
+ "_id": "loSga5EB2HLDXHkv9kAr",
+ "_score": null,
+ "_source": {
+ "item": "Chocolate Cake",
+ "category": "cakes",
+ "price": 15,
+ "baked_date": "2023-07-01T00:00:00Z",
+ "baker": "Baker A"
+ },
+ "fields": {
+ "item": [
+ "Chocolate Cake"
+ ]
+ },
+ "sort": [
+ 15
+ ]
+ }
+ ]
+ }
+}
+```
+
+The collapsed search results will show only one "Chocolate Cake" entry, demonstrating how the `collapse` parameter reduces redundancy.
+
+The `collapse` parameter affects only the top search results and does not change any aggregation results. The total number of hits shown in the response reflects all matching documents before the parameter is applied, including duplicates. However, the response doesn't indicate the exact number of unique groups formed by the operation.
+
+---
+
+## Expanding collapsed results
+
+You can expand each collapsed top hit with the `inner_hits` property.
+
+The following example request applies `inner_hits` to retrieve the lowest-priced and most recent item, for each type of cake:
+
+```json
+GET /bakery-items/_search
+{
+ "query": {
+ "match": {
+ "category": "cakes"
+ }
+ },
+ "collapse": {
+ "field": "item",
+ "inner_hits": [
+ {
+ "name": "cheapest_items",
+ "size": 1,
+ "sort": ["price"]
+ },
+ {
+ "name": "newest_items",
+ "size": 1,
+ "sort": [{ "baked_date": "desc" }]
+ }
+ ]
+ },
+ "sort": ["price"]
+}
+
+```
+
+### Multiple inner hits for each collapsed hit
+
+To obtain several groups of inner hits for each collapsed result, you can set different criteria for each group. For example, let's request the three most recent items for every bakery item:
+
+```json
+GET /bakery-items/_search
+{
+ "query": {
+ "match": {
+ "category": "cakes"
+ }
+ },
+ "collapse": {
+ "field": "item",
+ "inner_hits": [
+ {
+ "name": "cheapest_items",
+ "size": 1,
+ "sort": ["price"]
+ },
+ {
+ "name": "newest_items",
+ "size": 3,
+ "sort": [{ "baked_date": "desc" }]
+ }
+ ]
+ },
+ "sort": ["price"]
+}
+
+
+```
+This query searches for documents in the `cakes` category and groups the search results by the `item` field. For each `item`, it retrieves the lowest-priced item and the top three most recent items, sorted by `baked_date` in descending order.
+
+You can expand the groups by sending an additional query for each inner hit request corresponding to each collapsed hit in the response. This can significantly slow down the process if there are too many groups or inner hit requests. The `max_concurrent_group_searches` request parameter can be used to control the maximum number of concurrent searches allowed in this phase. The default is based on the number of data nodes and the default search thread pool size.
+
diff --git a/_search-plugins/concurrent-segment-search.md b/_search-plugins/concurrent-segment-search.md
index cbbb993ac9..80614e2fff 100644
--- a/_search-plugins/concurrent-segment-search.md
+++ b/_search-plugins/concurrent-segment-search.md
@@ -22,6 +22,8 @@ Without concurrent segment search, Lucene executes a request sequentially across
## Enabling concurrent segment search at the index or cluster level
+Starting with OpenSearch version 2.17, you can use the `search.concurrent_segment_search.mode` setting to configure concurrent segment search on your cluster. The existing `search.concurrent_segment_search.enabled` setting will be deprecated in future releases in favor of the new setting.
+
By default, concurrent segment search is disabled on the cluster. You can enable concurrent segment search at two levels:
- Cluster level
@@ -30,8 +32,37 @@ By default, concurrent segment search is disabled on the cluster. You can enable
The index-level setting takes priority over the cluster-level setting. Thus, if the cluster setting is enabled but the index setting is disabled, then concurrent segment search will be disabled for that index. Because of this, the index-level setting is not evaluated unless it is explicitly set, regardless of the default value configured for the setting. You can retrieve the current value of the index-level setting by calling the [Index Settings API]({{site.url}}{{site.baseurl}}/api-reference/index-apis/get-settings/) and omitting the `?include_defaults` query parameter.
{: .note}
-To enable concurrent segment search for all indexes in the cluster, set the following dynamic cluster setting:
+Both the cluster- and index-level `search.concurrent_segment_search.mode` settings accept the following values:
+
+- `all`: Enables concurrent segment search across all search requests. This is equivalent to setting `search.concurrent_segment_search.enabled` to `true`.
+
+- `none`: Disables concurrent segment search for all search requests, effectively turning off the feature. This is equivalent to setting `search.concurrent_segment_search.enabled` to `false`. This is the **default** behavior.
+
+- `auto`: In this mode, OpenSearch will use the pluggable _concurrent search decider_ to decide whether to use a concurrent or sequential path for the search request based on the query evaluation and the presence of aggregations in the request. By default, if there are no deciders configured by any plugin, then the decision to use concurrent search will be made based on the presence of aggregations in the request. For more information about the pluggable decider semantics, see [Pluggable concurrent search deciders](#pluggable-concurrent-search-deciders-concurrentsearchrequestdecider).
+
+To enable concurrent segment search for all search requests across every index in the cluster, send the following request:
+```json
+PUT _cluster/settings
+{
+ "persistent":{
+ "search.concurrent_segment_search.mode": "all"
+ }
+}
+```
+{% include copy-curl.html %}
+
+To enable concurrent segment search for all search requests on a particular index, specify the index name in the endpoint:
+
+```json
+PUT /_settings
+{
+ "index.search.concurrent_segment_search.mode": "all"
+}
+```
+{% include copy-curl.html %}
+
+You can continue to use the existing `search.concurrent_segment_search.enabled` setting to enable concurrent segment search for all indexes in the cluster as follows:
```json
PUT _cluster/settings
{
@@ -52,6 +83,35 @@ PUT /_settings
```
{% include copy-curl.html %}
+
+When evaluating whether concurrent segment search is enabled on a cluster, the `search.concurrent_segment_search.mode` setting takes precedence over the `search.concurrent_segment_search.enabled` setting.
+If the `search.concurrent_segment_search.mode` setting is not explicitly set, then the `search.concurrent_segment_search.enabled` setting will be evaluated to determine whether to enable concurrent segment search.
+
+When upgrading a cluster from an earlier version that specifies the older `search.concurrent_segment_search.enabled` setting, this setting will continue to be honored. However, once the `search.concurrent_segment_search.mode` is set, it will override the previous setting, enabling or disabling concurrent search based on the specified mode.
+We recommend setting `search.concurrent_segment_search.enabled` to `null` on your cluster once you configure `search.concurrent_segment_search.mode`:
+
+```json
+PUT _cluster/settings
+{
+ "persistent":{
+ "search.concurrent_segment_search.enabled": null
+ }
+}
+```
+{% include copy-curl.html %}
+
+To disable the old setting for a particular index, specify the index name in the endpoint:
+```json
+PUT /_settings
+{
+ "index.search.concurrent_segment_search.enabled": null
+}
+```
+{% include copy-curl.html %}
+
+
+
+
## Slicing mechanisms
You can choose one of two available mechanisms for assigning segments to slices: the default [Lucene mechanism](#the-lucene-mechanism) or the [max slice count mechanism](#the-max-slice-count-mechanism).
@@ -66,7 +126,10 @@ The _max slice count_ mechanism is an alternative slicing mechanism that uses a
### Setting the slicing mechanism
-By default, concurrent segment search uses the Lucene mechanism to calculate the number of slices for each shard-level request. To use the max slice count mechanism instead, configure the `search.concurrent.max_slice_count` cluster setting:
+By default, concurrent segment search uses the Lucene mechanism to calculate the number of slices for each shard-level request.
+To use the max slice count mechanism instead, you can set the slice count for concurrent segment search at either the cluster level or index level.
+
+To configure the slice count for all indexes in a cluster, use the following dynamic cluster setting:
```json
PUT _cluster/settings
@@ -78,7 +141,17 @@ PUT _cluster/settings
```
{% include copy-curl.html %}
-The `search.concurrent.max_slice_count` setting can take the following valid values:
+To configure the slice count for a particular index, specify the index name in the endpoint:
+
+```json
+PUT /_settings
+{
+ "index.search.concurrent.max_slice_count": 2
+}
+```
+{% include copy-curl.html %}
+
+Both the cluster- and index-level `search.concurrent.max_slice_count` settings can take the following valid values:
- `0`: Use the default Lucene mechanism.
- Positive integer: Use the max target slice count mechanism. Usually, a value between 2 and 8 should be sufficient.
@@ -117,8 +190,20 @@ Non-concurrent search calculates the document count error and returns it in the
For more information about how `shard_size` can affect both `doc_count_error_upper_bound` and collected buckets, see [this GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/11680#issuecomment-1885882985).
-## Developer information: AggregatorFactory changes
+## Developer information
+
+The following sections provide additional information for developers.
+
+### AggregatorFactory changes
+
+Because of implementation details, not all aggregator types can support concurrent segment search. To accommodate this, we have introduced a [`supportsConcurrentSegmentSearch()`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/search/aggregations/AggregatorFactory.java#L123) method in the `AggregatorFactory` class to indicate whether a given aggregation type supports concurrent segment search. By default, this method returns `false`. Any aggregator that needs to support concurrent segment search must override this method in its own factory implementation.
+
+To ensure that a custom plugin-based `Aggregator` implementation functions with the concurrent search path, plugin developers can verify their implementation with concurrent search enabled and then update the plugin to override the [`supportsConcurrentSegmentSearch()`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/search/aggregations/AggregatorFactory.java#L123) method to return `true`.
+
+### Pluggable concurrent search deciders: ConcurrentSearchRequestDecider
-Because of implementation details, not all aggregator types can support concurrent segment search. To accommodate this, we have introduced a [`supportsConcurrentSegmentSearch()`](https://github.com/opensearch-project/OpenSearch/blob/bb38ed4836496ac70258c2472668325a012ea3ed/server/src/main/java/org/opensearch/search/aggregations/AggregatorFactory.java#L121) method in the `AggregatorFactory` class to indicate whether a given aggregation type supports concurrent segment search. By default, this method returns `false`. Any aggregator that needs to support concurrent segment search must override this method in its own factory implementation.
+Introduced 2.17
+{: .label .label-purple }
-To ensure that a custom plugin-based `Aggregator` implementation works with the concurrent search path, plugin developers can verify their implementation with concurrent search enabled and then update the plugin to override the [`supportsConcurrentSegmentSearch()`](https://github.com/opensearch-project/OpenSearch/blob/bb38ed4836496ac70258c2472668325a012ea3ed/server/src/main/java/org/opensearch/search/aggregations/AggregatorFactory.java#L121) method to return `true`.
+Plugin developers can customize the concurrent search decision-making for `auto` mode by extending [`ConcurrentSearchRequestDecider`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/search/deciders/ConcurrentSearchRequestDecider.java) and registering its factory through [`SearchPlugin#getConcurrentSearchRequestFactories()`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/plugins/SearchPlugin.java#L148). The deciders are evaluated only if a request does not belong to any category listed in the [Limitations](#limitations) and [Other considerations](#other-considerations) sections. For more information about the decider implementation, see [the corresponding GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/15259).
+The search request is parsed using a `QueryBuilderVisitor`, which calls the [`ConcurrentSearchRequestDecider#evaluateForQuery()`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/search/deciders/ConcurrentSearchRequestDecider.java#L36) method of all the configured deciders for every node of the `QueryBuilder` tree in the search request. The final concurrent search decision is obtained by combining the decision from each decider returned by the [`ConcurrentSearchRequestDecider#getConcurrentSearchDecision()`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/search/deciders/ConcurrentSearchRequestDecider.java#L44) method.
\ No newline at end of file
diff --git a/_search-plugins/knn/api.md b/_search-plugins/knn/api.md
index c7314f7ae2..d927bf1c35 100644
--- a/_search-plugins/knn/api.md
+++ b/_search-plugins/knn/api.md
@@ -185,7 +185,7 @@ This API operation only works with indexes created using the `nmslib` and `faiss
The following request evicts the native library indexes of three indexes from the cache:
```json
-GET /_plugins/_knn/clear_cache/index1,index2,index3?pretty
+POST /_plugins/_knn/clear_cache/index1,index2,index3?pretty
{
"_shards" : {
"total" : 6,
@@ -200,7 +200,7 @@ The `total` parameter indicates the number of shards that the API attempted to c
The k-NN clear cache API can be used with index patterns to clear one or more indexes that match the given pattern from the cache, as shown in the following example:
```json
-GET /_plugins/_knn/clear_cache/index*?pretty
+POST /_plugins/_knn/clear_cache/index*?pretty
{
"_shards" : {
"total" : 6,
@@ -234,7 +234,7 @@ Response field | Description
`timestamp` | The date and time when the model was created.
`description` | A user-provided description of the model.
`error` | An error message explaining why the model is in a failed state.
-`space_type` | The space type for which this model is trained, for example, Euclidean or cosine.
+`space_type` | The space type for which this model is trained, for example, Euclidean or cosine. Note: This value can also be set at the top level of the request.
`dimension` | The dimensionality of the vector space for which this model is designed.
`engine` | The native library used to create the model, either `faiss` or `nmslib`.
@@ -351,6 +351,7 @@ Request parameter | Description
`search_size` | The training data is pulled from the training index using scroll queries. This parameter defines the number of results to return per scroll query. Default is `10000`. Optional.
`description` | A user-provided description of the model. Optional.
`method` | The configuration of the approximate k-NN method used for search operations. For more information about the available methods, see [k-NN index method definitions]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index#method-definitions). The method requires training to be valid.
+`space_type` | The space type for which this model is trained, for example, Euclidean or cosine. Note: This value can also be set in the `method` parameter.
#### Usage
@@ -365,10 +366,10 @@ POST /_plugins/_knn/models/{model_id}/_train?preference={node_id}
"max_training_vector_count": 1200,
"search_size": 100,
"description": "My model",
+ "space_type": "l2",
"method": {
"name":"ivf",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"nlist":128,
"encoder":{
@@ -395,10 +396,10 @@ POST /_plugins/_knn/models/_train?preference={node_id}
"max_training_vector_count": 1200,
"search_size": 100,
"description": "My model",
+ "space_type": "l2",
"method": {
"name":"ivf",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"nlist":128,
"encoder":{
diff --git a/_search-plugins/knn/approximate-knn.md b/_search-plugins/knn/approximate-knn.md
index e9cff8562f..f8921033e0 100644
--- a/_search-plugins/knn/approximate-knn.md
+++ b/_search-plugins/knn/approximate-knn.md
@@ -49,9 +49,9 @@ PUT my-knn-index-1
"my_vector1": {
"type": "knn_vector",
"dimension": 2,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "nmslib",
"parameters": {
"ef_construction": 128,
@@ -62,9 +62,9 @@ PUT my-knn-index-1
"my_vector2": {
"type": "knn_vector",
"dimension": 4,
+ "space_type": "innerproduct",
"method": {
"name": "hnsw",
- "space_type": "innerproduct",
"engine": "faiss",
"parameters": {
"ef_construction": 256,
@@ -199,10 +199,10 @@ POST /_plugins/_knn/models/my-model/_train
"training_field": "train-field",
"dimension": 4,
"description": "My model description",
+ "space_type": "l2",
"method": {
"name": "ivf",
"engine": "faiss",
- "space_type": "l2",
"parameters": {
"nlist": 4,
"nprobes": 2
@@ -308,6 +308,72 @@ Engine | Notes
:--- | :---
`faiss` | If `nprobes` is present in a query, it overrides the value provided when creating the index.
+### Rescoring quantized results using full precision
+
+Quantization can be used to significantly reduce the memory footprint of a k-NN index. For more information about quantization, see [k-NN vector quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization). Because some vector representation is lost during quantization, the computed distances will be approximate. This causes the overall recall of the search to decrease.
+
+To improve recall while maintaining the memory savings of quantization, you can use a two-phase search approach. In the first phase, `oversample_factor * k` results are retrieved from an index using quantized vectors and the scores are approximated. In the second phase, the full-precision vectors of those `oversample_factor * k` results are loaded into memory from disk, and scores are recomputed against the full-precision query vector. The results are then reduced to the top k.
+
+The default rescoring behavior is determined by the `mode` and `compression_level` of the backing k-NN vector field:
+
+- For `in_memory` mode, no rescoring is applied by default.
+- For `on_disk` mode, default rescoring is based on the configured `compression_level`. Each `compression_level` provides a default `oversample_factor`, specified in the following table.
+
+| Compression level | Default rescore `oversample_factor` |
+|:------------------|:----------------------------------|
+| `32x` (default) | 3.0 |
+| `16x` | 2.0 |
+| `8x` | 2.0 |
+| `4x` | No default rescoring |
+| `2x` | No default rescoring |
+
+To explicitly apply rescoring, provide the `rescore` parameter in a query on a quantized index and specify the `oversample_factor`:
+
+```json
+GET my-knn-index-1/_search
+{
+ "size": 2,
+ "query": {
+ "knn": {
+ "target-field": {
+ "vector": [2, 3, 5, 6],
+ "k": 2,
+ "rescore" : {
+ "oversample_factor": 1.2
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+Alternatively, set the `rescore` parameter to `true` to use a default `oversample_factor` of `1.0`:
+
+```json
+GET my-knn-index-1/_search
+{
+ "size": 2,
+ "query": {
+ "knn": {
+ "target-field": {
+ "vector": [2, 3, 5, 6],
+ "k": 2,
+ "rescore" : true
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The `oversample_factor` is a floating-point number between 1.0 and 100.0, inclusive. The number of results in the first pass is calculated as `oversample_factor * k` and is guaranteed to be between 100 and 10,000, inclusive. If the calculated number of results is smaller than 100, then the number of results is set to 100. If the calculated number of results is greater than 10,000, then the number of results is set to 10,000.
+
+Rescoring is only supported for the `faiss` engine.
+
+Rescoring is not needed if quantization is not used because the scores returned are already fully precise.
+{: .note}
+
### Using approximate k-NN with filters
To learn about using filters with k-NN search, see [k-NN search with filters]({{site.url}}{{site.baseurl}}/search-plugins/knn/filter-search-knn/).
@@ -322,7 +388,7 @@ To learn more about the radial search feature, see [k-NN radial search]({{site.u
### Using approximate k-NN with binary vectors
-To learn more about using binary vectors with k-NN search, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors).
+To learn more about using binary vectors with k-NN search, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors).
## Spaces
@@ -346,5 +412,5 @@ The cosine similarity formula does not include the `1 -` prefix. However, becaus
With cosine similarity, it is not valid to pass a zero vector (`[0, 0, ...]`) as input. This is because the magnitude of such a vector is 0, which raises a `divide by 0` exception in the corresponding formula. Requests containing the zero vector will be rejected, and a corresponding exception will be thrown.
{: .note }
-The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors).
+The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors).
{: .note}
diff --git a/_search-plugins/knn/disk-based-vector-search.md b/_search-plugins/knn/disk-based-vector-search.md
new file mode 100644
index 0000000000..dfb9262db5
--- /dev/null
+++ b/_search-plugins/knn/disk-based-vector-search.md
@@ -0,0 +1,193 @@
+---
+layout: default
+title: Disk-based vector search
+nav_order: 16
+parent: k-NN search
+has_children: false
+---
+
+# Disk-based vector search
+**Introduced 2.17**
+{: .label .label-purple}
+
+For low-memory environments, OpenSearch provides _disk-based vector search_, which significantly reduces the operational costs for vector workloads. Disk-based vector search uses [binary quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#binary-quantization), compressing vectors and thereby reducing the memory requirements. This memory optimization provides large memory savings at the cost of slightly increased search latency while still maintaining strong recall.
+
+To use disk-based vector search, set the [`mode`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#vector-workload-modes) parameter to `on_disk` for your vector field type. This parameter will configure your index to use secondary storage.
+
+## Creating an index for disk-based vector search
+
+To create an index for disk-based vector search, send the following request:
+
+```json
+PUT my-vector-index
+{
+ "mappings": {
+ "properties": {
+ "my_vector_field": {
+ "type": "knn_vector",
+ "dimension": 8,
+ "space_type": "innerproduct",
+ "data_type": "float",
+ "mode": "on_disk"
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+By default, the `on_disk` mode configures the index to use the `faiss` engine and `hnsw` method. The default [`compression_level`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#compression-levels) of `32x` reduces the amount of memory the vectors require by a factor of 32. To preserve the search recall, rescoring is enabled by default. A search on a disk-optimized index runs in two phases: The compressed index is searched first, and then the results are rescored using full-precision vectors loaded from disk.
+
+To reduce the compression level, provide the `compression_level` parameter when creating the index mapping:
+
+```json
+PUT my-vector-index
+{
+ "mappings": {
+ "properties": {
+ "my_vector_field": {
+ "type": "knn_vector",
+ "dimension": 8,
+ "space_type": "innerproduct",
+ "data_type": "float",
+ "mode": "on_disk",
+ "compression_level": "16x"
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+For more information about the `compression_level` parameter, see [Compression levels]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#compression-levels). Note that for `4x` compression, the `lucene` engine will be used.
+{: .note}
+
+If you need more granular fine-tuning, you can override additional k-NN parameters in the method definition. For example, to improve recall, increase the `ef_construction` parameter value:
+
+```json
+PUT my-vector-index
+{
+ "mappings": {
+ "properties": {
+ "my_vector_field": {
+ "type": "knn_vector",
+ "dimension": 8,
+ "space_type": "innerproduct",
+ "data_type": "float",
+ "mode": "on_disk",
+ "method": {
+ "params": {
+ "ef_construction": 512
+ }
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The `on_disk` mode only works with the `float` data type.
+{: .note}
+
+## Ingestion
+
+You can perform document ingestion for a disk-optimized vector index in the same way as for a regular vector index. To index several documents in bulk, send the following request:
+
+```json
+POST _bulk
+{ "index": { "_index": "my-vector-index", "_id": "1" } }
+{ "my_vector_field": [1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5], "price": 12.2 }
+{ "index": { "_index": "my-vector-index", "_id": "2" } }
+{ "my_vector_field": [2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5], "price": 7.1 }
+{ "index": { "_index": "my-vector-index", "_id": "3" } }
+{ "my_vector_field": [3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5], "price": 12.9 }
+{ "index": { "_index": "my-vector-index", "_id": "4" } }
+{ "my_vector_field": [4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5], "price": 1.2 }
+{ "index": { "_index": "my-vector-index", "_id": "5" } }
+{ "my_vector_field": [5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5], "price": 3.7 }
+{ "index": { "_index": "my-vector-index", "_id": "6" } }
+{ "my_vector_field": [6.5, 6.5, 6.5, 6.5, 6.5, 6.5, 6.5, 6.5], "price": 10.3 }
+{ "index": { "_index": "my-vector-index", "_id": "7" } }
+{ "my_vector_field": [7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5], "price": 5.5 }
+{ "index": { "_index": "my-vector-index", "_id": "8" } }
+{ "my_vector_field": [8.5, 8.5, 8.5, 8.5, 8.5, 8.5, 8.5, 8.5], "price": 4.4 }
+{ "index": { "_index": "my-vector-index", "_id": "9" } }
+{ "my_vector_field": [9.5, 9.5, 9.5, 9.5, 9.5, 9.5, 9.5, 9.5], "price": 8.9 }
+```
+{% include copy-curl.html %}
+
+## Search
+
+Search is also performed in the same way as in other index configurations. The key difference is that, by default, the `oversample_factor` of the rescore parameter is set to `3.0` (unless you override the `compression_level`). For more information, see [Rescoring quantized results using full precision]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#rescoring-quantized-results-using-full-precision). To perform vector search on a disk-optimized index, provide the search vector:
+
+```json
+GET my-vector-index/_search
+{
+ "query": {
+ "knn": {
+ "my_vector_field": {
+ "vector": [1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5],
+ "k": 5
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+Similarly to other index configurations, you can override k-NN parameters in the search request:
+
+```json
+GET my-vector-index/_search
+{
+ "query": {
+ "knn": {
+ "my_vector_field": {
+ "vector": [1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5],
+ "k": 5,
+ "method_parameters": {
+ "ef_search": 512
+ },
+ "rescore": {
+ "oversample_factor": 10.0
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+[Radial search]({{site.url}}{{site.baseurl}}/search-plugins/knn/radial-search-knn/) does not support disk-based vector search.
+{: .note}
+
+## Model-based indexes
+
+For [model-based indexes]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-k-nn-index-from-a-model), you can specify the `on_disk` parameter in the training request in the same way that you would specify it during index creation. By default, `on_disk` mode will use the [Faiss IVF method]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#supported-faiss-methods) and a compression level of `32x`. To run the training API, send the following request:
+
+```json
+POST /_plugins/_knn/models/test-model/_train
+{
+ "training_index": "train-index-name",
+ "training_field": "train-field-name",
+ "dimension": 8,
+ "max_training_vector_count": 1200,
+ "search_size": 100,
+ "description": "My model",
+ "space_type": "innerproduct",
+ "mode": "on_disk"
+}
+```
+{% include copy-curl.html %}
+
+This command assumes that training data has been ingested into the `train-index-name` index. For more information, see [Building a k-NN index from a model]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-k-nn-index-from-a-model).
+{: .note}
+
+You can override the `compression_level` for disk-optimized indexes in the same way as for regular k-NN indexes.
+
+
+## Next steps
+
+- For more information about binary quantization, see [Binary quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#binary-quantization).
+- For more information about k-NN vector workload modes, see [Vector workload modes]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#vector-workload-modes).
\ No newline at end of file
diff --git a/_search-plugins/knn/knn-index.md b/_search-plugins/knn/knn-index.md
index a6ffd922eb..620b262cf9 100644
--- a/_search-plugins/knn/knn-index.md
+++ b/_search-plugins/knn/knn-index.md
@@ -25,9 +25,9 @@ PUT /test-index
"my_vector1": {
"type": "knn_vector",
"dimension": 3,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "lucene",
"parameters": {
"ef_construction": 128,
@@ -41,17 +41,17 @@ PUT /test-index
```
{% include copy-curl.html %}
-## Lucene byte vector
+## Byte vectors
-Starting with k-NN plugin version 2.9, you can use `byte` vectors with the `lucene` engine to reduce the amount of storage space needed. For more information, see [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#lucene-byte-vector).
+Starting with k-NN plugin version 2.17, you can use `byte` vectors with the `faiss` and `lucene` engines to reduce the amount of required memory and storage space. For more information, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#byte-vectors).
-## Binary vector
+## Binary vectors
-Starting with k-NN plugin version 2.16, you can use `binary` vectors with the `faiss` engine to reduce the amount of required storage space. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors).
+Starting with k-NN plugin version 2.16, you can use `binary` vectors with the `faiss` engine to reduce the amount of required storage space. For more information, see [Binary vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors).
## SIMD optimization for the Faiss engine
-Starting with version 2.13, the k-NN plugin supports [Single Instruction Multiple Data (SIMD)](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data) processing if the underlying hardware supports SIMD instructions (AVX2 on x64 architecture and Neon on ARM64 architecture). SIMD is supported by default on Linux machines only for the Faiss engine. SIMD architecture helps boost overall performance by improving indexing throughput and reducing search latency.
+Starting with version 2.13, the k-NN plugin supports [Single Instruction Multiple Data (SIMD)](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data) processing if the underlying hardware supports SIMD instructions (AVX2 on x64 architecture and Neon on ARM64 architecture). SIMD is supported by default on Linux machines only for the Faiss engine. SIMD architecture helps boost overall performance by improving indexing throughput and reducing search latency. Starting with version 2.18, the k-NN plugin supports AVX512 SIMD instructions on x64 architecture.
SIMD optimization is applicable only if the vector dimension is a multiple of 8.
{: .note}
@@ -60,14 +60,22 @@ SIMD optimization is applicable only if the vector dimension is a multiple of 8.
### x64 architecture
-For the x64 architecture, two different versions of the Faiss library are built and shipped with the artifact:
+For x64 architecture, the following versions of the Faiss library are built and shipped with the artifact:
- `libopensearchknn_faiss.so`: The non-optimized Faiss library without SIMD instructions.
-- `libopensearchknn_faiss_avx2.so`: The Faiss library that contains AVX2 SIMD instructions.
+- `libopensearchknn_faiss_avx512.so`: The Faiss library containing AVX512 SIMD instructions.
+- `libopensearchknn_faiss_avx2.so`: The Faiss library containing AVX2 SIMD instructions.
-If your hardware supports AVX2, the k-NN plugin loads the `libopensearchknn_faiss_avx2.so` library at runtime.
+When using the Faiss library, the performance ranking is as follows: AVX512 > AVX2 > no optimization.
+{: .note }
+
+If your hardware supports AVX512, the k-NN plugin loads the `libopensearchknn_faiss_avx512.so` library at runtime.
+
+If your hardware supports AVX2 but doesn't support AVX512, the k-NN plugin loads the `libopensearchknn_faiss_avx2.so` library at runtime.
+
+To disable the AVX512 and AVX2 SIMD instructions and load the non-optimized Faiss library (`libopensearchknn_faiss.so`), specify the `knn.faiss.avx512.disabled` and `knn.faiss.avx2.disabled` static settings as `true` in `opensearch.yml` (by default, both of these are `false`).
-To disable AVX2 and load the non-optimized Faiss library (`libopensearchknn_faiss.so`), specify the `knn.faiss.avx2.disabled` static setting as `true` in `opensearch.yml` (default is `false`). Note that to update a static setting, you must stop the cluster, change the setting, and restart the cluster. For more information, see [Static settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/#static-settings).
+Note that to update a static setting, you must stop the cluster, change the setting, and restart the cluster. For more information, see [Static settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/#static-settings).
### ARM64 architecture
@@ -83,7 +91,7 @@ A method definition will always contain the name of the method, the space_type t
Mapping parameter | Required | Default | Updatable | Description
:--- | :--- | :--- | :--- | :---
`name` | true | n/a | false | The identifier for the nearest neighbor method.
-`space_type` | false | l2 | false | The vector space used to calculate the distance between vectors.
+`space_type` | false | l2 | false | The vector space used to calculate the distance between vectors. Note: This value can also be specified at the top level of the mapping.
`engine` | false | nmslib | false | The approximate k-NN library to use for indexing and search. The available libraries are faiss, nmslib, and Lucene.
`parameters` | false | null | false | The parameters used for the nearest neighbor method.
@@ -116,7 +124,7 @@ Method name | Requires training | Supported spaces | Description
For hnsw, "innerproduct" is not available when PQ is used.
{: .note}
-The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors).
+The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors).
{: .note}
#### HNSW parameters
@@ -168,7 +176,6 @@ An index created in OpenSearch version 2.11 or earlier will still use the old `e
"method": {
"name":"hnsw",
"engine":"lucene",
- "space_type": "l2",
"parameters":{
"m":2048,
"ef_construction": 245
@@ -186,7 +193,6 @@ The following example method definition specifies the `hnsw` method and a `pq` e
"method": {
"name":"hnsw",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"encoder":{
"name":"pq",
@@ -232,7 +238,6 @@ The following example uses the `ivf` method without specifying an encoder (by d
"method": {
"name":"ivf",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"nlist": 4,
"nprobes": 2
@@ -246,7 +251,6 @@ The following example uses the `ivf` method with a `pq` encoder:
"method": {
"name":"ivf",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"encoder":{
"name":"pq",
@@ -265,7 +269,6 @@ The following example uses the `hnsw` method without specifying an encoder (by d
"method": {
"name":"hnsw",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"ef_construction": 256,
"m": 8
@@ -279,7 +282,6 @@ The following example uses the `hnsw` method with an `sq` encoder of type `fp16`
"method": {
"name":"hnsw",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"encoder": {
"name": "sq",
@@ -300,7 +302,6 @@ The following example uses the `ivf` method with an `sq` encoder of type `fp16`:
"method": {
"name":"ivf",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"encoder": {
"name": "sq",
@@ -324,7 +325,7 @@ If you want to use less memory and increase indexing speed as compared to HNSW w
If memory is a concern, consider adding a PQ encoder to your HNSW or IVF index. Because PQ is a lossy encoding, query quality will drop.
-You can reduce the memory footprint by a factor of 2, with a minimal loss in search quality, by using the [`fp_16` encoder]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#faiss-16-bit-scalar-quantization). If your vector dimensions are within the [-128, 127] byte range, we recommend using the [byte quantizer]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#lucene-byte-vector) to reduce the memory footprint by a factor of 4. To learn more about vector quantization options, see [k-NN vector quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/).
+You can reduce the memory footprint by a factor of 2, with a minimal loss in search quality, by using the [`fp_16` encoder]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#faiss-16-bit-scalar-quantization). If your vector dimensions are within the [-128, 127] byte range, we recommend using the [byte quantizer]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#byte-vectors) to reduce the memory footprint by a factor of 4. To learn more about vector quantization options, see [k-NN vector quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/).
### Memory estimation
diff --git a/_search-plugins/knn/knn-score-script.md b/_search-plugins/knn/knn-score-script.md
index d2fd883e74..a184de2d3d 100644
--- a/_search-plugins/knn/knn-score-script.md
+++ b/_search-plugins/knn/knn-score-script.md
@@ -302,5 +302,5 @@ Cosine similarity returns a number between -1 and 1, and because OpenSearch rele
With cosine similarity, it is not valid to pass a zero vector (`[0, 0, ... ]`) as input. This is because the magnitude of such a vector is 0, which raises a `divide by 0` exception in the corresponding formula. Requests containing the zero vector will be rejected, and a corresponding exception will be thrown.
{: .note }
-The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors).
+The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors).
{: .note}
diff --git a/_search-plugins/knn/knn-vector-quantization.md b/_search-plugins/knn/knn-vector-quantization.md
index 656ce72fd2..a911dc91c9 100644
--- a/_search-plugins/knn/knn-vector-quantization.md
+++ b/_search-plugins/knn/knn-vector-quantization.md
@@ -11,15 +11,15 @@ has_math: true
By default, the k-NN plugin supports the indexing and querying of vectors of type `float`, where each dimension of the vector occupies 4 bytes of memory. For use cases that require ingestion on a large scale, keeping `float` vectors can be expensive because OpenSearch needs to construct, load, save, and search graphs (for native `nmslib` and `faiss` engines). To reduce the memory footprint, you can use vector quantization.
-OpenSearch supports many varieties of quantization. In general, the level of quantization will provide a trade-off between the accuracy of the nearest neighbor search and the size of the memory footprint consumed by the vector search. The supported types include byte vectors, 16-bit scalar quantization, and product quantization (PQ).
+OpenSearch supports many varieties of quantization. In general, the level of quantization will provide a trade-off between the accuracy of the nearest neighbor search and the size of the memory footprint consumed by the vector search. The supported types include byte vectors, 16-bit scalar quantization, product quantization (PQ), and binary quantization (BQ).
-## Lucene byte vector
+## Byte vectors
-Starting with k-NN plugin version 2.9, you can use `byte` vectors with the Lucene engine in order to reduce the amount of required memory. This requires quantizing the vectors outside of OpenSearch before ingesting them into an OpenSearch index. For more information, see [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#lucene-byte-vector).
+Starting with version 2.17, the k-NN plugin supports `byte` vectors with the `faiss` and `lucene` engines in order to reduce the amount of required memory. This requires quantizing the vectors outside of OpenSearch before ingesting them into an OpenSearch index. For more information, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#byte-vectors).
## Lucene scalar quantization
-Starting with version 2.16, the k-NN plugin supports built-in scalar quantization for the Lucene engine. Unlike the [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#lucene-byte-vector), which requires you to quantize vectors before ingesting the documents, the Lucene scalar quantizer quantizes input vectors in OpenSearch during ingestion. The Lucene scalar quantizer converts 32-bit floating-point input vectors into 7-bit integer vectors in each segment using the minimum and maximum quantiles computed based on the [`confidence_interval`](#confidence-interval) parameter. During search, the query vector is quantized in each segment using the segment's minimum and maximum quantiles in order to compute the distance between the query vector and the segment's quantized input vectors.
+Starting with version 2.16, the k-NN plugin supports built-in scalar quantization for the Lucene engine. Unlike [byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#byte-vectors), which require you to quantize vectors before ingesting documents, the Lucene scalar quantizer quantizes input vectors in OpenSearch during ingestion. The Lucene scalar quantizer converts 32-bit floating-point input vectors into 7-bit integer vectors in each segment using the minimum and maximum quantiles computed based on the [`confidence_interval`](#confidence-interval) parameter. During search, the query vector is quantized in each segment using the segment's minimum and maximum quantiles in order to compute the distance between the query vector and the segment's quantized input vectors.
Quantization can decrease the memory footprint by a factor of 4 in exchange for some loss in recall. Additionally, quantization slightly increases disk usage because it requires storing both the raw input vectors and the quantized vectors.
@@ -40,10 +40,10 @@ PUT /test-index
"my_vector1": {
"type": "knn_vector",
"dimension": 2,
+ "space_type": "l2",
"method": {
"name": "hnsw",
"engine": "lucene",
- "space_type": "l2",
"parameters": {
"encoder": {
"name": "sq"
@@ -85,10 +85,10 @@ PUT /test-index
"my_vector1": {
"type": "knn_vector",
"dimension": 2,
+ "space_type": "l2",
"method": {
"name": "hnsw",
"engine": "lucene",
- "space_type": "l2",
"parameters": {
"encoder": {
"name": "sq",
@@ -115,7 +115,7 @@ In the ideal scenario, 7-bit vectors created by the Lucene scalar quantizer use
#### HNSW memory estimation
-The memory required for the Hierarchical Navigable Small World (HNSW) graph can be estimated as `1.1 * (dimension + 8 * M)` bytes/vector, where `M` is the maximum number of bidirectional links created for each element during the construction of the graph.
+The memory required for the Hierarchical Navigable Small World (HNSW) graph can be estimated as `1.1 * (dimension + 8 * m)` bytes/vector, where `m` is the maximum number of bidirectional links created for each element during the construction of the graph.
As an example, assume that you have 1 million vectors with a dimension of 256 and M of 16. The memory requirement can be estimated as follows:
@@ -150,10 +150,10 @@ PUT /test-index
"my_vector1": {
"type": "knn_vector",
"dimension": 3,
+ "space_type": "l2",
"method": {
"name": "hnsw",
"engine": "faiss",
- "space_type": "l2",
"parameters": {
"encoder": {
"name": "sq"
@@ -194,10 +194,10 @@ PUT /test-index
"my_vector1": {
"type": "knn_vector",
"dimension": 3,
+ "space_type": "l2",
"method": {
"name": "hnsw",
"engine": "faiss",
- "space_type": "l2",
"parameters": {
"encoder": {
"name": "sq",
@@ -250,9 +250,9 @@ In the best-case scenario, 16-bit vectors produced by the Faiss SQfp16 quantizer
#### HNSW memory estimation
-The memory required for Hierarchical Navigable Small Worlds (HNSW) is estimated to be `1.1 * (2 * dimension + 8 * M)` bytes/vector.
+The memory required for Hierarchical Navigable Small Worlds (HNSW) is estimated to be `1.1 * (2 * dimension + 8 * m)` bytes/vector, where `m` is the maximum number of bidirectional links created for each element during the construction of the graph.
-As an example, assume that you have 1 million vectors with a dimension of 256 and M of 16. The memory requirement can be estimated as follows:
+As an example, assume that you have 1 million vectors with a dimension of 256 and an `m` of 16. The memory requirement can be estimated as follows:
```r
1.1 * (2 * 256 + 8 * 16) * 1,000,000 ~= 0.656 GB
@@ -260,9 +260,9 @@ As an example, assume that you have 1 million vectors with a dimension of 256 an
#### IVF memory estimation
-The memory required for IVF is estimated to be `1.1 * (((2 * dimension) * num_vectors) + (4 * nlist * d))` bytes/vector.
+The memory required for IVF is estimated to be `1.1 * (((2 * dimension) * num_vectors) + (4 * nlist * dimension))` bytes/vector, where `nlist` is the number of buckets to partition vectors into.
-As an example, assume that you have 1 million vectors with a dimension of 256 and `nlist` of 128. The memory requirement can be estimated as follows:
+As an example, assume that you have 1 million vectors with a dimension of 256 and an `nlist` of 128. The memory requirement can be estimated as follows:
```r
1.1 * (((2 * 256) * 1,000,000) + (4 * 128 * 256)) ~= 0.525 GB
@@ -310,3 +310,175 @@ For example, assume that you have 1 million vectors with a dimension of 256, `iv
```r
1.1*((8 / 8 * 64 + 24) * 1000000 + 100 * (2^8 * 4 * 256 + 4 * 512 * 256)) ~= 0.171 GB
```
+
+## Binary quantization
+
+Starting with version 2.17, OpenSearch supports BQ with binary vector support for the Faiss engine. BQ compresses vectors into a binary format (0s and 1s), making it highly efficient in terms of memory usage. You can choose to represent each vector dimension using 1, 2, or 4 bits, depending on the desired precision. One of the advantages of using BQ is that the training process is handled automatically during indexing. This means that no separate training step is required, unlike other quantization techniques such as PQ.
+
+### Using BQ
+To configure BQ for the Faiss engine, define a `knn_vector` field and specify the `mode` as `on_disk`. This configuration defaults to 1-bit BQ, with both `ef_search` and `ef_construction` set to `100`:
+
+```json
+PUT my-vector-index
+{
+ "mappings": {
+ "properties": {
+ "my_vector_field": {
+ "type": "knn_vector",
+ "dimension": 8,
+ "space_type": "l2",
+ "data_type": "float",
+ "mode": "on_disk"
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+To further optimize the configuration, you can specify additional parameters, such as the compression level, and fine-tune the search parameters. For example, you can override the `ef_construction` value or define the compression level, which corresponds to the number of bits used for quantization:
+
+- **32x compression** for 1-bit quantization
+- **16x compression** for 2-bit quantization
+- **8x compression** for 4-bit quantization
+
+This allows for greater control over memory usage and recall performance, providing the flexibility to balance precision and storage efficiency.
+
+To specify the compression level, set the `compression_level` parameter:
+
+```json
+PUT my-vector-index
+{
+ "mappings": {
+ "properties": {
+ "my_vector_field": {
+ "type": "knn_vector",
+ "dimension": 8,
+ "space_type": "l2",
+ "data_type": "float",
+ "mode": "on_disk",
+ "compression_level": "16x",
+ "method": {
+ "parameters": {
+ "ef_construction": 16
+ }
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The following example further fine-tunes the configuration by defining `ef_construction`, `encoder`, and the number of `bits` (which can be `1`, `2`, or `4`):
+
+```json
+PUT my-vector-index
+{
+ "mappings": {
+ "properties": {
+ "my_vector_field": {
+ "type": "knn_vector",
+ "dimension": 8,
+ "method": {
+ "name": "hnsw",
+ "engine": "faiss",
+ "space_type": "l2",
+ "parameters": {
+ "m": 16,
+ "ef_construction": 512,
+ "encoder": {
+ "name": "binary",
+ "parameters": {
+ "bits": 1
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+### Search using binary quantized vectors
+
+You can perform a k-NN search on your index by providing a vector and specifying the number of nearest neighbors (k) to return:
+
+```json
+GET my-vector-index/_search
+{
+ "size": 2,
+ "query": {
+ "knn": {
+ "my_vector_field": {
+ "vector": [1.5, 5.5, 1.5, 5.5, 1.5, 5.5, 1.5, 5.5],
+ "k": 10
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+You can also fine-tune search by providing the `ef_search` and `oversample_factor` parameters.
+The `oversample_factor` parameter controls the factor by which the search oversamples the candidate vectors before ranking them. Using a higher oversample factor means that more candidates will be considered before ranking, improving accuracy but also increasing search time. When selecting the `oversample_factor` value, consider the trade-off between accuracy and efficiency. For example, setting the `oversample_factor` to `2.0` will double the number of candidates considered during the ranking phase, which may help achieve better results.
+
+The following request specifies the `ef_search` and `oversample_factor` parameters:
+
+```json
+GET my-vector-index/_search
+{
+ "size": 2,
+ "query": {
+ "knn": {
+ "my_vector_field": {
+ "vector": [1.5, 5.5, 1.5, 5.5, 1.5, 5.5, 1.5, 5.5],
+ "k": 10,
+ "method_parameters": {
+ "ef_search": 10
+ },
+ "rescore": {
+ "oversample_factor": 10.0
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+
+#### HNSW memory estimation
+
+The memory required for the Hierarchical Navigable Small World (HNSW) graph can be estimated as `1.1 * (dimension + 8 * m)` bytes/vector, where `m` is the maximum number of bidirectional links created for each element during the construction of the graph.
+
+As an example, assume that you have 1 million vectors with a dimension of 256 and an `m` of 16. The following sections provide memory requirement estimations for various compression values.
+
+##### 1-bit quantization (32x compression)
+
+In 1-bit quantization, each dimension is represented using 1 bit, equivalent to a 32x compression factor. The memory requirement can be estimated as follows:
+
+```r
+Memory = 1.1 * ((256 * 1 / 8) + 8 * 16) * 1,000,000
+ ~= 0.176 GB
+```
+
+##### 2-bit quantization (16x compression)
+
+In 2-bit quantization, each dimension is represented using 2 bits, equivalent to a 16x compression factor. The memory requirement can be estimated as follows:
+
+```r
+Memory = 1.1 * ((256 * 2 / 8) + 8 * 16) * 1,000,000
+ ~= 0.211 GB
+```
+
+##### 4-bit quantization (8x compression)
+
+In 4-bit quantization, each dimension is represented using 4 bits, equivalent to an 8x compression factor. The memory requirement can be estimated as follows:
+
+```r
+Memory = 1.1 * ((256 * 4 / 8) + 8 * 16) * 1,000,000
+ ~= 0.282 GB
+```
diff --git a/_search-plugins/knn/nested-search-knn.md b/_search-plugins/knn/nested-search-knn.md
index d947ebc6e6..bbba6c9c1e 100644
--- a/_search-plugins/knn/nested-search-knn.md
+++ b/_search-plugins/knn/nested-search-knn.md
@@ -38,9 +38,9 @@ PUT my-knn-index-1
"my_vector": {
"type": "knn_vector",
"dimension": 3,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "lucene",
"parameters": {
"ef_construction": 100,
@@ -324,9 +324,9 @@ PUT my-knn-index-1
"my_vector": {
"type": "knn_vector",
"dimension": 3,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "lucene",
"parameters": {
"ef_construction": 100,
diff --git a/_search-plugins/knn/painless-functions.md b/_search-plugins/knn/painless-functions.md
index cc27776fc4..7a8d9fec7b 100644
--- a/_search-plugins/knn/painless-functions.md
+++ b/_search-plugins/knn/painless-functions.md
@@ -55,7 +55,7 @@ l1Norm | `float l1Norm (float[] queryVector, doc['vector field'])` | This functi
cosineSimilarity | `float cosineSimilarity (float[] queryVector, doc['vector field'])` | Cosine similarity is an inner product of the query vector and document vector normalized to both have a length of 1. If the magnitude of the query vector doesn't change throughout the query, you can pass the magnitude of the query vector to improve performance, instead of calculating the magnitude every time for every filtered document:
`float cosineSimilarity (float[] queryVector, doc['vector field'], float normQueryVector)`
In general, the range of cosine similarity is [-1, 1]. However, in the case of information retrieval, the cosine similarity of two documents ranges from 0 to 1 because the tf-idf statistic can't be negative. Therefore, the k-NN plugin adds 1.0 in order to always yield a positive cosine similarity score.
hamming | `float hamming (float[] queryVector, doc['vector field'])` | This function calculates the Hamming distance between a given query vector and document vectors. The Hamming distance is the number of positions at which the corresponding elements are different. The shorter the distance, the more relevant the document is, so this example inverts the return value of the Hamming distance.
-The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors).
+The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors).
{: .note}
## Constraints
diff --git a/_search-plugins/knn/performance-tuning.md b/_search-plugins/knn/performance-tuning.md
index 123b1daef1..77f44dee93 100644
--- a/_search-plugins/knn/performance-tuning.md
+++ b/_search-plugins/knn/performance-tuning.md
@@ -59,9 +59,9 @@ The `_source` field contains the original JSON document body that was passed at
"location": {
"type": "knn_vector",
"dimension": 2,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "faiss"
}
}
@@ -85,9 +85,9 @@ In OpenSearch 2.15 or later, you can further improve indexing speed and reduce d
"location": {
"type": "knn_vector",
"dimension": 2,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "faiss"
}
}
diff --git a/_search-plugins/knn/radial-search-knn.md b/_search-plugins/knn/radial-search-knn.md
index 1a4a223294..e5449a0993 100644
--- a/_search-plugins/knn/radial-search-knn.md
+++ b/_search-plugins/knn/radial-search-knn.md
@@ -53,9 +53,9 @@ PUT knn-index-test
"my_vector": {
"type": "knn_vector",
"dimension": 2,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "faiss",
"parameters": {
"ef_construction": 100,
diff --git a/_search-plugins/knn/settings.md b/_search-plugins/knn/settings.md
index 1b9aa3608c..e4731ec94c 100644
--- a/_search-plugins/knn/settings.md
+++ b/_search-plugins/knn/settings.md
@@ -27,6 +27,7 @@ Setting | Static/Dynamic | Default | Description
`knn.model.index.number_of_replicas`| Dynamic | `1` | The number of replica shards to use for the model system index. Generally, in a multi-node cluster, this value should be at least 1 in order to increase stability.
`knn.model.cache.size.limit` | Dynamic | `10%` | The model cache limit cannot exceed 25% of the JVM heap.
`knn.faiss.avx2.disabled` | Static | `false` | A static setting that specifies whether to disable the SIMD-based `libopensearchknn_faiss_avx2.so` library and load the non-optimized `libopensearchknn_faiss.so` library for the Faiss engine on machines with x64 architecture. For more information, see [SIMD optimization for the Faiss engine]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#simd-optimization-for-the-faiss-engine).
+`knn.faiss.avx512.disabled` | Static | `false` | A static setting that specifies whether to disable the SIMD-based `libopensearchknn_faiss_avx512.so` library and load the `libopensearchknn_faiss_avx2.so` library or the non-optimized `libopensearchknn_faiss.so` library for the Faiss engine on machines with x64 architecture. For more information, see [SIMD optimization for the Faiss engine]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#simd-optimization-for-the-faiss-engine).
## Index settings
diff --git a/_search-plugins/search-pipelines/using-search-pipeline.md b/_search-plugins/search-pipelines/using-search-pipeline.md
index ecb988ad11..b6dbbdc5d0 100644
--- a/_search-plugins/search-pipelines/using-search-pipeline.md
+++ b/_search-plugins/search-pipelines/using-search-pipeline.md
@@ -17,14 +17,45 @@ You can use a search pipeline in the following ways:
## Specifying an existing search pipeline for a request
-After you [create a search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/), you can use the pipeline with a query by specifying the pipeline name in the `search_pipeline` query parameter:
+After you [create a search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/), you can use the pipeline with a query in the following ways. For a complete example of using a search pipeline with a `filter_query` processor, see [`filter_query` processor example]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/filter-query-processor#example).
+
+### Specifying the pipeline in a query parameter
+
+You can specify the pipeline name in the `search_pipeline` query parameter as follows:
```json
GET /my_index/_search?search_pipeline=my_pipeline
```
{% include copy-curl.html %}
-For a complete example of using a search pipeline with a `filter_query` processor, see [`filter_query` processor example]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/filter-query-processor#example).
+### Specifying the pipeline in the request body
+
+You can provide a search pipeline ID in the search request body as follows:
+
+```json
+GET /my-index/_search
+{
+ "query": {
+ "match_all": {}
+ },
+ "from": 0,
+ "size": 10,
+ "search_pipeline": "my_pipeline"
+}
+```
+{% include copy-curl.html %}
+
+For multi-search, you can provide a search pipeline ID in the search request body as follows:
+
+```json
+GET /_msearch
+{ "index": "test"}
+{ "query": { "match_all": {} }, "from": 0, "size": 10, "search_pipeline": "my_pipeline"}
+{ "index": "test-1", "search_type": "dfs_query_then_fetch"}
+{ "query": { "match_all": {} }, "search_pipeline": "my_pipeline1" }
+
+```
+{% include copy-curl.html %}
## Using a temporary search pipeline for a request
diff --git a/_search-plugins/searching-data/inner-hits.md b/_search-plugins/searching-data/inner-hits.md
index 395e9e748a..38fc7a491d 100644
--- a/_search-plugins/searching-data/inner-hits.md
+++ b/_search-plugins/searching-data/inner-hits.md
@@ -139,8 +139,8 @@ The preceding query searches for nested user objects containing the name John an
}
}
```
-## Inner hits with parent-child objects
-Parent-join relationships allow you to create relationships between documents of different types within the same index. The following example request searches with `inner_hits` using parent-child objects.
+## Inner hits with parent/child objects
+Parent-join relationships allow you to create relationships between documents of different types within the same index. The following example request searches with `inner_hits` using parent/child objects.
1. Create an index with a parent-join field:
@@ -806,4 +806,8 @@ The following is the expected result:
Using `inner_hits` provides contextual relevance by showing exactly which nested or child documents match the query criteria. This is crucial for applications in which the relevance of results depends on a specific part of the document that matches the query.
- Example use case: In a customer support system, you have tickets as parent documents and comments or updates as nested or child documents. You can determine which specific comment matches the search in order to better understand the context of the ticket search.
\ No newline at end of file
+ Example use case: In a customer support system, you have tickets as parent documents and comments or updates as nested or child documents. You can determine which specific comment matches the search in order to better understand the context of the ticket search.
+
+## Next steps
+
+- Learn about [joining queries]({{site.url}}{{site.baseurl}}/query-dsl/joining/) on [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) or [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) fields.
\ No newline at end of file
diff --git a/_search-plugins/vector-search.md b/_search-plugins/vector-search.md
index cd893f4144..f19030bf90 100644
--- a/_search-plugins/vector-search.md
+++ b/_search-plugins/vector-search.md
@@ -37,9 +37,9 @@ PUT test-index
"my_vector1": {
"type": "knn_vector",
"dimension": 1024,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "nmslib",
"parameters": {
"ef_construction": 128,
@@ -57,7 +57,7 @@ PUT test-index
You must designate the field that will store vectors as a [`knn_vector`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/) field type. OpenSearch supports vectors of up to 16,000 dimensions, each of which is represented as a 32-bit or 16-bit float.
-To save storage space, you can use `byte` or `binary` vectors. For more information, see [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#lucene-byte-vector) and [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors).
+To save storage space, you can use `byte` or `binary` vectors. For more information, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#byte-vectors) and [Binary vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors).
### k-NN vector search
@@ -131,9 +131,9 @@ PUT /hotels-index
"location": {
"type": "knn_vector",
"dimension": 2,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "lucene",
"parameters": {
"ef_construction": 100,
diff --git a/_security-analytics/threat-intelligence/getting-started.md b/_security-analytics/threat-intelligence/getting-started.md
index 366bc2674c..b26063bed0 100644
--- a/_security-analytics/threat-intelligence/getting-started.md
+++ b/_security-analytics/threat-intelligence/getting-started.md
@@ -50,15 +50,64 @@ Local files uploaded as the threat intelligence source must use the following sp
When using the `S3_SOURCE` as a remote store, the following connection information must be provided:
-- **IAM Role ARN**: The Amazon Resource Name (ARN) for an AWS Identity and Access Management (IAM) role.
-- **S3 bucket directory**: The name of the Amazon Simple Storage Service (Amazon S3) bucket in which the `STIX2` file is stored.
-- **Specify a directory or file**: The object key or directory path for the `STIX2` file in the S3 bucket.
+- **IAM Role ARN**: The Amazon Resource Name (ARN) for an AWS Identity and Access Management (IAM) role. When using the AWS OpenSearch Service, the role ARN needs to be in the same account as the OpenSearch domain. For more information about adding a new role for the AWS OpenSearch Service, see [Add service ARN](#add-aws-opensearch-service-arn).
+- **S3 bucket directory**: The name of the Amazon Simple Storage Service (Amazon S3) bucket in which the `STIX2` file is stored. To access an S3 bucket in a different AWS account, see the [Cross-account S3 bucket connection](#cross-account-s3-bucket-connection) section for more details.
+- **Specify a file**: The object key for the `STIX2` file in the S3 bucket.
- **Region**: The AWS Region for the S3 bucket.
You can also set the **Download schedule**, which determines when OpenSearch downloads an updated `STIX2` file from the connected S3 bucket. The default interval is once a day. Only daily intervals are supported.
Alternatively, you can check the **Download on demand** option, which prevents new data from the bucket from being automatically downloaded.
+#### Add AWS OpenSearch Service ARN
+
+If you're using the AWS OpenSearch Service, create a new ARN role with a custom trust policy. For instructions on how to create the role, see [Creating a role for an AWS service](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_create_for-service.html#roles-creatingrole-service-console).
+
+When creating the role, customize the following settings:
+
+- Add the following custom trust policy:
+
+ ```bash
+ {
+ "Version": "2012-10-17",
+ "Statement": [
+ {
+ "Effect": "Allow",
+ "Principal": {
+ "Service": [
+ "opensearchservice.amazonaws.com"
+ ]
+ },
+ "Action": "sts:AssumeRole"
+ }
+ ]
+ }
+ ```
+
+- On the Permissions policies page, add the `AmazonS3ReadOnlyAccess` permission.
+
+
+#### Cross-account S3 bucket connection
+
+Because the role ARN needs to be in the same account as the OpenSearch domain, a trust policy needs to be configured that allows the OpenSearch domain to download from S3 buckets in the same account.
+
+To download from an S3 bucket in another account, the trust policy for that bucket needs to give the role ARN permission to read from the object, as shown in the following example:
+
+```
+{
+ "Version": "2012-10-17",
+ "Statement": [
+ {
+ "Effect": "Allow",
+ "Principal": {
+ "AWS": "arn:aws:iam::123456789012:role/account-1-threat-intel-role"
+ },
+ "Action": "s3:*",
+ "Resource": "arn:aws:s3:::account-2-threat-intel-bucket/*"
+ }
+ ]
+}
+```
## Step 2: Set up scanning for your log sources
diff --git a/_security/access-control/document-level-security.md b/_security/access-control/document-level-security.md
index 352fe06a61..b17b60e147 100644
--- a/_security/access-control/document-level-security.md
+++ b/_security/access-control/document-level-security.md
@@ -13,6 +13,8 @@ Document-level security lets you restrict a role to a subset of documents in an
![Document- and field-level security screen in OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/images/security-dls.png)
+The maximum size for the document-level security configuration is 1024 KB (1,048,404 characters).
+{: .warning}
## Simple roles
diff --git a/_security/audit-logs/index.md b/_security/audit-logs/index.md
index becb001ec0..8eeea33447 100644
--- a/_security/audit-logs/index.md
+++ b/_security/audit-logs/index.md
@@ -224,3 +224,36 @@ plugins.security.audit.config.threadpool.max_queue_len: 100000
To disable audit logs after they've been enabled, remove the `plugins.security.audit.type: internal_opensearch` setting from `opensearch.yml`, or switch off the **Enable audit logging** check box in OpenSearch Dashboards.
+## Audit user account manipulation
+
+To enable audit logging on changes to a security index, such as changes to role mappings and role creation or deletion, use the following settings in the `compliance:` portion of the audit log configuration, as shown in the following example:
+
+```
+_meta:
+ type: "audit"
+ config_version: 2
+
+config:
+ # enable/disable audit logging
+ enabled: true
+
+ ...
+
+
+ compliance:
+ # enable/disable compliance
+ enabled: true
+
+ # Log updates to internal security changes
+ internal_config: true
+
+ # Log only metadata of the document for write events
+ write_metadata_only: false
+
+ # Log only diffs for document updates
+ write_log_diffs: true
+
+ # List of indices to watch for write events. Wildcard patterns are supported
+ # write_watched_indices: ["twitter", "logs-*"]
+ write_watched_indices: [".opendistro_security"]
+```
diff --git a/_security/authentication-backends/jwt.md b/_security/authentication-backends/jwt.md
index 3f28dfecfd..6c7311e7dc 100644
--- a/_security/authentication-backends/jwt.md
+++ b/_security/authentication-backends/jwt.md
@@ -117,7 +117,7 @@ The following table lists the configuration parameters.
Name | Description
:--- | :---
-`signing_key` | The signing key to use when verifying the token. If you use a symmetric key algorithm, it is the base64-encoded shared secret. If you use an asymmetric algorithm, it contains the public key.
+`signing_key` | The signing key(s) used to verify the token. If you use a symmetric key algorithm, this is the Base64-encoded shared secret. If you use an asymmetric algorithm, the algorithm contains the public key. To pass multiple keys, use a comma-separated list or enumerate the keys.
`jwt_header` | The HTTP header in which the token is transmitted. This is typically the `Authorization` header with the `Bearer` schema,`Authorization: Bearer `. Default is `Authorization`. Replacing this field with a value other than `Authorization` prevents the audit log from properly redacting the JWT header from audit messages. It is recommended that users only use `Authorization` when using JWTs with audit logging.
`jwt_url_parameter` | If the token is not transmitted in the HTTP header but rather as an URL parameter, define the name of the parameter here.
`subject_key` | The key in the JSON payload that stores the username. If not set, the [subject](https://tools.ietf.org/html/rfc7519#section-4.1.2) registered claim is used.
diff --git a/_security/configuration/disable-enable-security.md b/_security/configuration/disable-enable-security.md
index 811fd2a69f..38bcc01cdd 100755
--- a/_security/configuration/disable-enable-security.md
+++ b/_security/configuration/disable-enable-security.md
@@ -155,22 +155,22 @@ Use the following steps to reinstall the plugin:
1. Disable shard allocation and stop all nodes so that shards don't move when the cluster is restarted:
- ```json
- curl -XPUT "http://localhost:9200/_cluster/settings" -H 'Content-Type: application/json' -d '{
- "transient": {
- "cluster.routing.allocation.enable": "none"
- }
- }'
- ```
- {% include copy.html %}
+ ```json
+ curl -XPUT "http://localhost:9200/_cluster/settings" -H 'Content-Type: application/json' -d '{
+ "transient": {
+ "cluster.routing.allocation.enable": "none"
+ }
+ }'
+ ```
+ {% include copy.html %}
2. Install the Security plugin on all nodes in your cluster using one of the [installation methods]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/#install):
- ```bash
- bin/opensearch-plugin install opensearch-security
- ```
- {% include copy.html %}
-
+ ```bash
+ bin/opensearch-plugin install opensearch-security
+ ```
+ {% include copy.html %}
+
3. Add the necessary configuration to `opensearch.yml` for TLS encryption. See
[Configuration]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/security-settings/) for information about the settings that need to be configured.
diff --git a/_security/configuration/index.md b/_security/configuration/index.md
index e351e8865f..f68667d92d 100644
--- a/_security/configuration/index.md
+++ b/_security/configuration/index.md
@@ -3,7 +3,7 @@ layout: default
title: Configuration
nav_order: 2
has_children: true
-has_toc: false
+has_toc: true
redirect_from:
- /security-plugin/configuration/
- /security-plugin/configuration/index/
@@ -11,21 +11,105 @@ redirect_from:
# Security configuration
-The plugin includes demo certificates so that you can get up and running quickly. To use OpenSearch in a production environment, you must configure it manually:
+The Security plugin includes demo certificates so that you can get up and running quickly. To use OpenSearch with the Security plugin in a production environment, you must make changes to the demo certificates and other configuration options manually.
-1. [Replace the demo certificates]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/docker/#configuring-basic-security-settings).
-1. [Reconfigure `opensearch.yml` to use your certificates]({{site.url}}{{site.baseurl}}/security/configuration/tls).
-1. [Reconfigure `config.yml` to use your authentication backend]({{site.url}}{{site.baseurl}}/security/configuration/configuration/) (if you don't plan to use the internal user database).
-1. [Modify the configuration YAML files]({{site.url}}{{site.baseurl}}/security/configuration/yaml).
-1. If you plan to use the internal user database, [set a password policy in `opensearch.yml`]({{site.url}}{{site.baseurl}}/security/configuration/yaml/#opensearchyml).
-1. [Apply changes using the `securityadmin` script]({{site.url}}{{site.baseurl}}/security/configuration/security-admin).
-1. Start OpenSearch.
-1. [Add users, roles, role mappings, and tenants]({{site.url}}{{site.baseurl}}/security/access-control/index/).
+## Replace the demo certificates
-If you don't want to use the plugin, see [Disable security]({{site.url}}{{site.baseurl}}/security/configuration/disable-enable-security/).
+OpenSearch ships with demo certificates intended for quick setup and demonstration purposes. For a production environment, it's critical to replace these with your own trusted certificates, using the following steps, to ensure secure communication:
-The Security plugin has several default users, roles, action groups, permissions, and settings for OpenSearch Dashboards that use kibana in their names. We will change these names in a future release.
+1. **Generate your own certificates:** Use tools like OpenSSL or a certificate authority (CA) to generate your own certificates. For more information about generating certificates with OpenSSL, see [Generating self-signed certificates]({{site.url}}{{site.baseurl}}/security/configuration/generate-certificates/).
+2. **Store the generated certificates and private key in the appropriate directory:** Generated certificates are typically stored in `/config/`. For more information, see [Add certificate files to opensearch.yml]({{site.url}}{{site.baseurl}}/security/configuration/generate-certificates/#add-certificate-files-to-opensearchyml).
+3. **Set the following file permissions:**
+ - Private key (.key files): Set the file mode to `600`. This restricts access so that only the file owner (the OpenSearch user) can read and write to the file, ensuring that the private key remains secure and inaccessible to unauthorized users.
+ - Public certificates (.crt, .pem files): Set the file mode to `644`. This allows the file owner to read and write to the file, while other users can only read it.
+
+For additional guidance on file modes, see the following table.
+
+ | Item | Sample | Numeric | Bitwise |
+ |-------------|---------------------|---------|--------------|
+ | Public key | `~/.ssh/id_rsa.pub` | `644` | `-rw-r--r--` |
+ | Private key | `~/.ssh/id_rsa` | `600` | `-rw-------` |
+ | SSH folder | `~/.ssh` | `700` | `drwx------` |
+
+For more information, see [Configuring basic security settings]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/docker/#configuring-basic-security-settings).
+
+## Reconfigure `opensearch.yml` to use your certificates
+
+The `opensearch.yml` file is the main configuration file for OpenSearch; you can find the file at `/config/opensearch.yml`. Use the following steps to update this file to point to your custom certificates:
+
+In `opensearch.yml`, set the correct paths for your certificates and keys, as shown in the following example:
+ ```
+ plugins.security.ssl.transport.pemcert_filepath: /path/to/your/cert.pem
+ plugins.security.ssl.transport.pemkey_filepath: /path/to/your/key.pem
+ plugins.security.ssl.transport.pemtrustedcas_filepath: /path/to/your/ca.pem
+ plugins.security.ssl.http.enabled: true
+ plugins.security.ssl.http.pemcert_filepath: /path/to/your/cert.pem
+ plugins.security.ssl.http.pemkey_filepath: /path/to/your/key.pem
+ plugins.security.ssl.http.pemtrustedcas_filepath: /path/to/your/ca.pem
+ ```
+For more information, see [Configuring TLS certificates]({{site.url}}{{site.baseurl}}/security/configuration/tls/).
+
+## Reconfigure `config.yml` to use your authentication backend
+
+The `config.yml` file allows you to configure the authentication and authorization mechanisms for OpenSearch. Update the authentication backend settings in `/config/opensearch-security/config.yml` according to your requirements.
+
+For example, to use LDAP as your authentication backend, add the following settings:
+
+ ```
+ authc:
+ basic_internal_auth:
+ http_enabled: true
+ transport_enabled: true
+ order: 1
+ http_authenticator:
+ type: basic
+ challenge: true
+ authentication_backend:
+ type: internal
+ ```
+For more information, see [Configuring the Security backend]({{site.url}}{{site.baseurl}}/security/configuration/configuration/).
+
+## Modify the configuration YAML files
+
+Determine whether any additional YAML files need modification, for example, the `roles.yml`, `roles_mapping.yml`, or `internal_users.yml` files. Update the files with any additional configuration information. For more information, see [Modifying the YAML files]({{site.url}}{{site.baseurl}}/security/configuration/yaml/).
+
+## Set a password policy
+
+When using the internal user database, we recommend enforcing a password policy to ensure that strong passwords are used. For information about strong password policies, see [Password settings]({{site.url}}{{site.baseurl}}/security/configuration/yaml/#password-settings).
+
+## Apply changes using the `securityadmin` script
+
+The following steps do not apply to first-time users because the security index is automatically initialized from the YAML configuration files when OpenSearch starts.
+{: .note}
+
+After initial setup, if you make changes to your security configuration or disable automatic initialization by setting `plugins.security.allow_default_init_securityindex` to `false` (which prevents security index initialization from `yaml` files), you need to manually apply changes using the `securityadmin` script:
+
+1. Find the `securityadmin` script. The script is typically stored in the OpenSearch plugins directory, `plugins/opensearch-security/tools/securityadmin.[sh|bat]`.
+ - Note: If you're using OpenSearch 1.x, the `securityadmin` script is located in the `plugins/opendistro_security/tools/` directory.
+ - For more information, see [Basic usage](https://opensearch.org/docs/latest/security/configuration/security-admin/#basic-usage).
+2. Run the script by using the following command:
+ ```
+ ./plugins/opensearch-security/tools/securityadmin.[sh|bat]
+ ```
+3. Check the OpenSearch logs and configuration to ensure that the changes have been successfully applied.
+
+For more information about using the `securityadmin` script, see [Applying changes to configuration files]({{site.url}}{{site.baseurl}}/security/configuration/security-admin/).
+
+## Disable or enable the Security plugin
+
+If you don't want to use the Security plugin, you can disable it by adding the following setting to the `opensearch.yml` file:
+
+```
+plugins.security.disabled: true
+```
+
+You can then enable the plugin by removing the `plugins.security.disabled` setting.
+
+For more information about disabling the Security plugin, see [Disable security]({{site.url}}{{site.baseurl}}/security/configuration/disable-enable-security/).
+
+The Security plugin has several default users, roles, action groups, permissions, and settings for OpenSearch Dashboards that contain "Kibana" in their names. We will change these names in a future version.
{: .note }
-For a full list of `opensearch.yml` Security plugin settings, Security plugin settings, see [Security settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/security-settings/).
+For a full list of `opensearch.yml` Security plugin settings, see [Security settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/security-settings/).
{: .note}
+
diff --git a/_security/configuration/yaml.md b/_security/configuration/yaml.md
index 1686c8332e..2694e3a24f 100644
--- a/_security/configuration/yaml.md
+++ b/_security/configuration/yaml.md
@@ -265,7 +265,7 @@ kibana_server:
## roles.yml
-This file contains any initial roles that you want to add to the Security plugin. Aside from some metadata, the default file is empty, because the Security plugin has a number of static roles that it adds automatically.
+This file contains any initial roles that you want to add to the Security plugin. By default, this file contains predefined roles that grant usage to plugins within the default distribution of OpenSearch. The Security plugin will also add a number of static roles automatically.
```yml
---
diff --git a/_tools/index.md b/_tools/index.md
index 108f10da97..c9d446a81a 100644
--- a/_tools/index.md
+++ b/_tools/index.md
@@ -18,6 +18,7 @@ This section provides documentation for OpenSearch-supported tools, including:
- [OpenSearch CLI](#opensearch-cli)
- [OpenSearch Kubernetes operator](#opensearch-kubernetes-operator)
- [OpenSearch upgrade, migration, and comparison tools](#opensearch-upgrade-migration-and-comparison-tools)
+- [Sycamore](#sycamore) for AI-powered extract, transform, load (ETL) on complex documents for vector and hybrid search
For information about Data Prepper, the server-side data collector for filtering, enriching, transforming, normalizing, and aggregating data for downstream analytics and visualization, see [Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/index/).
@@ -122,3 +123,9 @@ The OpenSearch Kubernetes Operator is an open-source Kubernetes operator that he
OpenSearch migration tools facilitate migrations to OpenSearch and upgrades to newer versions of OpenSearch. These can help you set up a proof-of-concept environment locally using Docker containers or deploy to AWS using a one-click deployment script. This empowers you to fine-tune cluster configurations and manage workloads more effectively before migration.
For more information about OpenSearch migration tools, see the documentation in the [OpenSearch Migration GitHub repository](https://github.com/opensearch-project/opensearch-migrations/tree/capture-and-replay-v0.1.0).
+
+## Sycamore
+
+[Sycamore](https://github.com/aryn-ai/sycamore) is an open-source, AI-powered document processing engine designed to prepare unstructured data for retrieval-augmented generation (RAG) and semantic search using Python. Sycamore supports chunking and enriching a wide range of complex document types, including reports, presentations, transcripts, and manuals. Additionally, Sycamore can extract and process embedded elements, such as tables, figures, graphs, and other infographics. It can then load the data into target indexes, including vector and keyword indexes, using an [OpenSearch connector](https://sycamore.readthedocs.io/en/stable/sycamore/connectors/opensearch.html).
+
+For more information, see [Sycamore]({{site.url}}{{site.baseurl}}/tools/sycamore/).
diff --git a/_tools/sycamore.md b/_tools/sycamore.md
new file mode 100644
index 0000000000..9b3986dbf3
--- /dev/null
+++ b/_tools/sycamore.md
@@ -0,0 +1,48 @@
+---
+layout: default
+title: Sycamore
+nav_order: 210
+has_children: false
+---
+
+# Sycamore
+
+[Sycamore](https://github.com/aryn-ai/sycamore) is an open-source, AI-powered document processing engine designed to prepare unstructured data for retrieval-augmented generation (RAG) and semantic search using Python. Sycamore supports chunking and enriching a wide range of complex document types, including reports, presentations, transcripts, and manuals. Additionally, Sycamore can extract and process embedded elements, such as tables, figures, graphs, and other infographics. It can then load the data into target indexes, including vector and keyword indexes, using a connector like the [OpenSearch connector](https://sycamore.readthedocs.io/en/stable/sycamore/connectors/opensearch.html).
+
+To get started, visit the [Sycamore documentation](https://sycamore.readthedocs.io/en/stable/sycamore/get_started.html).
+
+## Sycamore ETL pipeline structure
+
+A Sycamore extract, transform, load (ETL) pipeline applies a series of transformations to a [DocSet](https://sycamore.readthedocs.io/en/stable/sycamore/get_started/concepts.html#docsets), which is a collection of documents and their constituent elements (for example, tables, blocks of text, or headers). At the end of the pipeline, the DocSet is loaded into OpenSearch vector and keyword indexes.
+
+A typical pipeline for preparing unstructured data for vector or hybrid search in OpenSearch consists of the following steps:
+
+* Read documents into a [DocSet](https://sycamore.readthedocs.io/en/stable/sycamore/get_started/concepts.html#docsets).
+* [Partition documents](https://sycamore.readthedocs.io/en/stable/sycamore/transforms/partition.html) into structured JSON elements.
+* Extract metadata and filter and clean data using [transforms](https://sycamore.readthedocs.io/en/stable/sycamore/APIs/docset.html).
+* Create [chunks](https://sycamore.readthedocs.io/en/stable/sycamore/transforms/merge.html) from groups of elements.
+* Embed the chunks using the model of your choice.
+* [Load](https://sycamore.readthedocs.io/en/stable/sycamore/connectors/opensearch.html) the embeddings, metadata, and text into OpenSearch vector and keyword indexes.
+
+For an example pipeline that uses this workflow, see [this notebook](https://github.com/aryn-ai/sycamore/blob/main/notebooks/opensearch_docs_etl.ipynb).
+
+
+## Install Sycamore
+
+We recommend installing the Sycamore library using `pip`. The connector for OpenSearch can be specified and installed using extras. For example:
+
+```bash
+pip install sycamore-ai[opensearch]
+```
+{% include copy.html %}
+
+By default, Sycamore works with the Aryn Partitioning Service to process PDFs. To run inference locally for partitioning or embedding, install Sycamore with the `local-inference` extra as follows:
+
+```bash
+pip install sycamore-ai[opensearch,local-inference]
+```
+{% include copy.html %}
+
+## Next steps
+
+For more information, visit the [Sycamore documentation](https://sycamore.readthedocs.io/en/stable/sycamore/get_started.html).
diff --git a/_troubleshoot/tls.md b/_troubleshoot/tls.md
index 93e9a2c490..6c777ad5b8 100644
--- a/_troubleshoot/tls.md
+++ b/_troubleshoot/tls.md
@@ -207,7 +207,7 @@ plugins.security.ssl.http.enabled_protocols:
TLS relies on the server and client negotiating a common cipher suite. Depending on your system, the available ciphers will vary. They depend on the JDK or OpenSSL version you're using, and whether or not the `JCE Unlimited Strength Jurisdiction Policy Files` are installed.
-For legal reasons, the JDK does not include strong ciphers like AES256. In order to use strong ciphers you need to download and install the [Java Cryptography Extension (JCE) Unlimited Strength Jurisdiction Policy Files](https://www.oracle.com/technetwork/java/javase/downloads/jce8-download-2133166.html). If you don't have them installed, you might see an error message on startup:
+For legal reasons, the JDK does not include strong ciphers like AES256. In order to use strong ciphers you need to download and install the [Java Cryptography Extension (JCE) Unlimited Strength Jurisdiction Policy Files](https://www.oracle.com/java/technologies/javase-jce8-downloads.html). If you don't have them installed, you might see an error message on startup:
```
[INFO ] AES-256 not supported, max key length for AES is 128 bit.
diff --git a/_tuning-your-cluster/availability-and-recovery/remote-store/remote-cluster-state.md b/_tuning-your-cluster/availability-and-recovery/remote-store/remote-cluster-state.md
index d967aca914..03cd1716f0 100644
--- a/_tuning-your-cluster/availability-and-recovery/remote-store/remote-cluster-state.md
+++ b/_tuning-your-cluster/availability-and-recovery/remote-store/remote-cluster-state.md
@@ -67,10 +67,14 @@ The remote cluster state functionality has the following limitations:
## Remote cluster state publication
-
The cluster manager node processes updates to the cluster state. It then publishes the updated cluster state through the local transport layer to all of the follower nodes. With the `remote_store.publication` feature enabled, the cluster state is backed up to the remote store during every state update. The follower nodes can then fetch the state from the remote store directly, which reduces the overhead on the cluster manager node for publication.
-To enable the feature flag for the `remote_store.publication` feature, follow the steps in the [experimental feature flag documentation]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/experimental/).
+To enable this feature, configure the following setting in `opensearch.yml`:
+
+```yml
+# Enable Remote cluster state publication
+cluster.remote_store.publication.enabled: true
+```
Enabling the setting does not change the publication flow, and follower nodes will not send acknowledgements back to the cluster manager node
until they download the updated cluster state from the remote store.
@@ -89,8 +93,11 @@ You do not have to use different remote store repositories for state and routing
To configure remote publication, use the following cluster settings.
-Setting | Default | Description
-:--- | :--- | :---
-`cluster.remote_store.state.read_timeout` | 20s | The amount of time to wait for remote state download to complete on the follower node.
-`cluster.remote_store.routing_table.path_type` | HASHED_PREFIX | The path type to be used for creating an index routing path in the blob store. Valid values are `FIXED`, `HASHED_PREFIX`, and `HASHED_INFIX`.
-`cluster.remote_store.routing_table.path_hash_algo` | FNV_1A_BASE64 | The algorithm to be used for constructing the prefix or infix of the blob store path. This setting is applied if `cluster.remote_store.routing_table.path_type` is `hashed_prefix` or `hashed_infix`. Valid algorithm values are `FNV_1A_BASE64` and `FNV_1A_COMPOSITE_1`.
+Setting | Default | Description
+:--- |:---| :---
+`cluster.remote_store.state.read_timeout` | 20s | The amount of time to wait for the remote state download to complete on the follower node.
+`cluster.remote_store.state.path.prefix` | "" (Empty string) | The fixed prefix to add to the index metadata files in the blob store.
+`cluster.remote_store.index_metadata.path_type` | `HASHED_PREFIX` | The path type used for creating an index metadata path in the blob store. Valid values are `FIXED`, `HASHED_PREFIX`, and `HASHED_INFIX`.
+`cluster.remote_store.index_metadata.path_hash_algo` | `FNV_1A_BASE64` | The algorithm that constructs the prefix or infix for the index metadata path in the blob store. This setting is applied if the `cluster.remote_store.index_metadata.path_type` setting is `HASHED_PREFIX` or `HASHED_INFIX`. Valid algorithm values are `FNV_1A_BASE64` and `FNV_1A_COMPOSITE_1`.
+`cluster.remote_store.routing_table.path.prefix` | "" (Empty string) | The fixed prefix to add for the index routing files in the blob store.
+
diff --git a/_tuning-your-cluster/availability-and-recovery/remote-store/snapshot-interoperability.md b/_tuning-your-cluster/availability-and-recovery/remote-store/snapshot-interoperability.md
index 0415af65f1..e93f504be3 100644
--- a/_tuning-your-cluster/availability-and-recovery/remote-store/snapshot-interoperability.md
+++ b/_tuning-your-cluster/availability-and-recovery/remote-store/snapshot-interoperability.md
@@ -27,7 +27,7 @@ PUT /_snapshot/snap_repo
```
{% include copy-curl.html %}
-Once enabled, all requests using the [Snapshot API]({{site.url}}{{site.baseurl}}/api-reference/snapshots/index/) will remain the same for all snapshots. After the setting is enabled, we recommend not disabling the setting. Doing so could affect data durability.
+Once enabled, all requests using the [Snapshot API]({{site.url}}{{site.baseurl}}/api-reference/snapshots/index/) will remain the same for all snapshots. Therefore, do not disable the shallow snapshot setting after it has been enabled because disabling the setting could affect data durability.
## Considerations
@@ -37,3 +37,43 @@ Consider the following before using shallow copy snapshots:
- All nodes in the cluster must use OpenSearch 2.10 or later to take advantage of shallow copy snapshots.
- The `incremental` file count and size between the current snapshot and the last snapshot is `0` when using shallow copy snapshots.
- Searchable snapshots are not supported inside shallow copy snapshots.
+
+## Shallow snapshot v2
+
+Starting with OpenSearch 2.17, the shallow snapshot feature offers an improved version called `shallow snapshot v2`, which aims to make snapshot operations more efficient and scalable by introducing the following enhancements:
+
+* Deterministic snapshot operations: Shallow snapshot v2 makes snapshot operations more deterministic, ensuring consistent and predictable behavior.
+* Minimized cluster state updates: Shallow snapshot v2 minimizes the number of cluster state updates required during snapshot operations, reducing overhead and improving performance.
+* Scalability: Shallow snapshot v2 allows snapshot operations to scale independently of the number of shards in the cluster, enabling better performance and efficiency for large datasets.
+
+Shallow snapshot v2 must be enabled separately from shallow copies.
+
+### Enabling shallow snapshot v2
+
+To enable shallow snapshot v2, enable the following repository settings:
+
+- `remote_store_index_shallow_copy: true`
+- `shallow_snapshot_v2: true`
+
+The following example request creates a shallow snapshot v2 repository:
+
+```bash
+PUT /_snapshot/snap_repo
+{
+"type": "s3",
+"settings": {
+"bucket": "test-bucket",
+"base_path": "daily-snaps",
+"remote_store_index_shallow_copy": true,
+"shallow_snapshot_v2": true
+}
+}
+```
+{% include copy-curl.html %}
+
+### Limitations
+
+Shallow snapshot v2 has the following limitations:
+
+* Shallow snapshot v2 is only supported for remote-backed indexes.
+* All nodes in the cluster must use OpenSearch 2.17 or later to take advantage of shallow snapshot v2.
diff --git a/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md b/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md
index b9e35b2697..d13955f3f0 100644
--- a/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md
+++ b/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md
@@ -18,7 +18,7 @@ The searchable snapshot feature incorporates techniques like caching frequently
To configure the searchable snapshots feature, create a node in your `opensearch.yml file` and define the node role as `search`. Optionally, you can also configure the `cache.size` property for the node.
-A `search` node reserves storage for the cache to perform searchable snapshot queries. In the case of a dedicated search node where the node exclusively has the `search` role, this value defaults to a fixed percentage of available storage. In other cases, the value needs to be configured by the user using the `node.search.cache.size` setting.
+A `search` node reserves storage for the cache to perform searchable snapshot queries. In the case of a dedicated search node where the node exclusively has the `search` role, this value defaults to a fixed percentage (80%) of available storage. In other cases, the value needs to be configured by the user using the `node.search.cache.size` setting.
Parameter | Type | Description
:--- | :--- | :---
diff --git a/_tuning-your-cluster/index.md b/_tuning-your-cluster/index.md
index 99db78565f..fa0973395f 100644
--- a/_tuning-your-cluster/index.md
+++ b/_tuning-your-cluster/index.md
@@ -192,11 +192,27 @@ To better understand and monitor your cluster, use the [CAT API]({{site.url}}{{s
## (Advanced) Step 6: Configure shard allocation awareness or forced awareness
+To further fine-tune your shard allocation, you can set custom node attributes for shard allocation awareness or forced awareness.
+
### Shard allocation awareness
-If your nodes are spread across several geographical zones, you can configure shard allocation awareness to allocate all replica shards to a zone that’s different from their primary shard.
+You can set custom node attributes on OpenSearch nodes to be used for shard allocation awareness. For example, you can set the `zone` attribute on each node to represent the zone in which the node is located. You can also use the `zone` attribute to ensure that the primary shard and its replica shards are allocated in a balanced manner across available, distinct zones. In this scenario, maximum shard copies per zone would equal `ceil (number_of_shard_copies/number_of_distinct_zones)`.
+
+OpenSearch, by default, allocates shard copies of a single shard across different nodes. When only 1 zone is available, such as after a zone failure, OpenSearch allocates replica shards to the only remaining zone---it considers only available zones (attribute values) when calculating the maximum number of allowed shard copies per zone.
+
+For example, if your index has a total of 5 shard copies (1 primary and 4 replicas) and nodes in 3 distinct zones, then OpenSearch will perform the following to allocate all 5 shard copies:
+
+- Allocate no more than 2 shards per zone, which will require at least 2 nodes in 2 zones.
+- Allocate the last shard in the third zone, with at least 1 node needed in the third zone.
-With shard allocation awareness, if the nodes in one of your zones fail, you can be assured that your replica shards are spread across your other zones. It adds a layer of fault tolerance to ensure your data survives a zone failure beyond just individual node failures.
+Alternatively, if you have 3 nodes in the first zone and 1 node in each remaining zone, then OpenSearch will allocate:
+
+- 2 shard copies in the first zone.
+- 1 shard copy in the remaining 2 zones.
+
+The final shard copy will remain unallocated due to the lack of nodes.
+
+With shard allocation awareness, if the nodes in one of your zones fail, you can be assured that your replica shards are spread across your other zones, adding a layer of fault tolerance to ensure that your data survives zone failures.
To configure shard allocation awareness, add zone attributes to `opensearch-d1` and `opensearch-d2`, respectively:
@@ -219,6 +235,8 @@ PUT _cluster/settings
}
```
+You can also use multiple attributes for shard allocation awareness by providing the attributes as a comma-separated string, for example, `zone,rack`.
+
You can either use `persistent` or `transient` settings. We recommend the `persistent` setting because it persists through a cluster reboot. Transient settings don't persist through a cluster reboot.
Shard allocation awareness attempts to separate primary and replica shards across multiple zones. However, if only one zone is available (such as after a zone failure), OpenSearch allocates replica shards to the only remaining zone.
diff --git a/_tuning-your-cluster/replication-plugin/auto-follow.md b/_tuning-your-cluster/replication-plugin/auto-follow.md
index 828b835387..92e7a6c144 100644
--- a/_tuning-your-cluster/replication-plugin/auto-follow.md
+++ b/_tuning-your-cluster/replication-plugin/auto-follow.md
@@ -98,9 +98,9 @@ To delete a replication rule, send the following request to the follower cluster
```bash
curl -XDELETE -k -H 'Content-Type: application/json' -u 'admin:' 'https://localhost:9200/_plugins/_replication/_autofollow?pretty' -d '
{
- "leader_alias" : "my-conection-alias",
+ "leader_alias" : "my-connection-alias",
"name": "my-replication-rule"
}'
```
-When you delete a replication rule, OpenSearch stops replicating *new* indexes that match the pattern, but existing indexes that the rule previously created remain read-only and continue to replicate. If you need to stop existing replication activity and open the indexes up for writes, use the [stop replication API operation]({{site.url}}{{site.baseurl}}/replication-plugin/api/#stop-replication).
\ No newline at end of file
+When you delete a replication rule, OpenSearch stops replicating *new* indexes that match the pattern, but existing indexes that the rule previously created remain read-only and continue to replicate. If you need to stop existing replication activity and open the indexes up for writes, use the [stop replication API operation]({{site.url}}{{site.baseurl}}/replication-plugin/api/#stop-replication).
diff --git a/assets/examples/ecommerce.json b/assets/examples/ecommerce.ndjson
similarity index 100%
rename from assets/examples/ecommerce.json
rename to assets/examples/ecommerce.ndjson
diff --git a/assets/js/search.js b/assets/js/search.js
index 8d9cab2ec5..86970d9544 100644
--- a/assets/js/search.js
+++ b/assets/js/search.js
@@ -173,7 +173,10 @@
const showNoResults = () => {
emptyResults();
- elResults.appendChild(document.createRange().createContextualFragment('No results found!'));
+ const resultElement = document.createElement('div');
+ resultElement.classList.add('search-page--results--no-results');
+ resultElement.appendChild(document.createRange().createContextualFragment('No results found.'));
+ elResults.appendChild(resultElement);
showResults();
elSpinner?.classList.remove(CLASSNAME_SPINNING);
};
@@ -278,8 +281,6 @@
window.doResultsPageSearch = async (query, type, version) => {
- console.log("Running results page search!");
-
const searchResultsContainer = document.getElementById('searchPageResultsContainer');
try {
@@ -291,7 +292,7 @@ window.doResultsPageSearch = async (query, type, version) => {
if (data.results && data.results.length > 0) {
data.results.forEach(result => {
const resultElement = document.createElement('div');
- resultElement.classList.add('search-page--results--diplay--container--item');
+ resultElement.classList.add('search-page--results--display--container--item');
const contentCite = document.createElement('cite');
const crumbs = [...result.ancestors];
@@ -302,11 +303,9 @@ window.doResultsPageSearch = async (query, type, version) => {
const titleLink = document.createElement('a');
titleLink.href = result.url;
+ titleLink.classList.add('search-page--results--display--container--item--link');
titleLink.textContent = result.title;
- titleLink.style.fontSize = '1.5em';
- titleLink.style.fontWeight = 'bold';
- titleLink.style.display = 'block';
-
+
const contentSpan = document.createElement('span');
contentSpan.textContent = result.content;
contentSpan.style.display = 'block';
@@ -317,16 +316,10 @@ window.doResultsPageSearch = async (query, type, version) => {
// Append the result element to the searchResultsContainer
searchResultsContainer.appendChild(resultElement);
-
- const breakline = document.createElement('hr');
- breakline.style.border = '.5px solid #ccc';
- breakline.style.margin = 'auto';
- searchResultsContainer.appendChild(breakline);
});
} else {
const noResultsElement = document.createElement('div');
noResultsElement.textContent = 'No results found.';
- noResultsElement.style.fontSize = '2em';
searchResultsContainer.appendChild(noResultsElement);
}
} catch (error) {
diff --git a/build.sh b/build.sh
index 060bbfa666..85ef617931 100755
--- a/build.sh
+++ b/build.sh
@@ -1,3 +1,9 @@
#!/usr/bin/env bash
-JEKYLL_LINK_CHECKER=internal bundle exec jekyll serve --host localhost --port 4000 --incremental --livereload --open-url --trace
+host="localhost"
+
+if [[ "$DOCKER_BUILD" == "true" ]]; then
+ host="0.0.0.0"
+fi
+
+JEKYLL_LINK_CHECKER=internal bundle exec jekyll serve --host ${host} --port 4000 --incremental --livereload --open-url --trace
diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml
new file mode 100644
index 0000000000..04dd007db9
--- /dev/null
+++ b/docker-compose.dev.yml
@@ -0,0 +1,14 @@
+version: "3"
+
+services:
+ doc_builder:
+ image: ruby:3.2.4
+ volumes:
+ - .:/app
+ working_dir: /app
+ ports:
+ - "4000:4000"
+ command: bash -c "bundler install && bash build.sh"
+ environment:
+ BUNDLE_PATH: /app/vendor/bundle # Avoid installing gems globally.
+ DOCKER_BUILD: true # Signify build.sh to bind to 0.0.0.0 for effective doc access from host.
diff --git a/release-notes/opensearch-documentation-release-notes-2.17.0.md b/release-notes/opensearch-documentation-release-notes-2.17.0.md
new file mode 100644
index 0000000000..d9ed51737c
--- /dev/null
+++ b/release-notes/opensearch-documentation-release-notes-2.17.0.md
@@ -0,0 +1,36 @@
+# OpenSearch Documentation Website 2.17.0 Release Notes
+
+The OpenSearch 2.17.0 documentation includes the following additions and updates.
+
+## New documentation for 2.17.0
+
+- Get offline batch inference details using task API in ML Commons [#8305](https://github.com/opensearch-project/documentation-website/pull/8305)
+- Documentation for Binary Quantization Support with KNN Vector Search [#8281](https://github.com/opensearch-project/documentation-website/pull/8281)
+- add offline batch ingestion tech doc [#8251](https://github.com/opensearch-project/documentation-website/pull/8251)
+- Add documentation changes for disk-based k-NN [#8246](https://github.com/opensearch-project/documentation-website/pull/8246)
+- Derived field updates for 2.17 [#8244](https://github.com/opensearch-project/documentation-website/pull/8244)
+- Add changes for multiple signing keys [#8243](https://github.com/opensearch-project/documentation-website/pull/8243)
+- Add documentation changes for Snapshot Status API [#8235](https://github.com/opensearch-project/documentation-website/pull/8235)
+- Update flow framework additional fields in previous_node_inputs [#8233](https://github.com/opensearch-project/documentation-website/pull/8233)
+- Add documentation changes for shallow snapshot v2 [#8207](https://github.com/opensearch-project/documentation-website/pull/8207)
+- Add documentation for context and ABC templates [#8197](https://github.com/opensearch-project/documentation-website/pull/8197)
+- Create documentation for snapshots with hashed prefix path type [#8196](https://github.com/opensearch-project/documentation-website/pull/8196)
+- Adding documentation for remote index use in AD [#8191](https://github.com/opensearch-project/documentation-website/pull/8191)
+- Doc update for concurrent search [#8181](https://github.com/opensearch-project/documentation-website/pull/8181)
+- Adding new cluster search setting docs [#8180](https://github.com/opensearch-project/documentation-website/pull/8180)
+- Add new settings for remote publication [#8176](https://github.com/opensearch-project/documentation-website/pull/8176)
+- Grouping Top N queries documentation [#8173](https://github.com/opensearch-project/documentation-website/pull/8173)
+- Document reprovision param for Update Workflow API [#8172](https://github.com/opensearch-project/documentation-website/pull/8172)
+- Add documentation for Faiss byte vector [#8170](https://github.com/opensearch-project/documentation-website/pull/8170)
+- Terms query can accept encoded terms input as bitmap [#8133](https://github.com/opensearch-project/documentation-website/pull/8133)
+- Update doc for adding new param in cat shards action for cancellation… [#8127](https://github.com/opensearch-project/documentation-website/pull/8127)
+- Add docs on skip_validating_missing_parameters in ml-commons connector [#8118](https://github.com/opensearch-project/documentation-website/pull/8118)
+- Add Split Response Processor to 2.17 Search Pipeline docs [#8081](https://github.com/opensearch-project/documentation-website/pull/8081)
+- Added documentation for FGAC for Flow Framework [#8076](https://github.com/opensearch-project/documentation-website/pull/8076)
+- Remove composite agg limitations for concurrent search [#7904](https://github.com/opensearch-project/documentation-website/pull/7904)
+- Add doc for nodes stats search.request.took fields [#7887](https://github.com/opensearch-project/documentation-website/pull/7887)
+- Add documentation for ignore_hosts config option for ip-based rate limiting [#7859](https://github.com/opensearch-project/documentation-website/pull/7859)
+
+## Documentation for 2.17.0 experimental features
+
+- Document new experimental ingestion streaming APIs [#8123](https://github.com/opensearch-project/documentation-website/pull/8123)
document.addEventListener('DOMContentLoaded', function() {
const categoryAll = document.getElementById('categoryAll');
const categoryDocumentation = document.getElementById('categoryDocumentation');
- const categoryNews = document.getElementById('categoryNews');
+ const categoryBlog = document.getElementById('categoryBlog');
+ const categoryEvent = document.getElementById('categoryEvent');
const searchInput = document.getElementById('searchPageInput');
function updateAllCheckbox() {
- if (categoryDocumentation.checked && categoryNews.checked) {
+ if (categoryDocumentation.checked && categoryBlog.checked && categoryEvent.checked) {
categoryAll.checked = true;
} else {
categoryAll.checked = false;
@@ -153,10 +166,12 @@
function updateChildCheckboxes() {
if (categoryAll.checked) {
categoryDocumentation.checked = true;
- categoryNews.checked = true;
+ categoryBlog.checked = true;
+ categoryEvent.checked = true;
} else {
categoryDocumentation.checked = false;
- categoryNews.checked = false;
+ categoryBlog.checked = false;
+ categoryEvent.checked = false;
}
}
@@ -168,7 +183,11 @@
updateAllCheckbox();
triggerSearch(searchInput.value.trim());
});
- categoryNews.addEventListener('change', () => {
+ categoryBlog.addEventListener('change', () => {
+ updateAllCheckbox();
+ triggerSearch(searchInput.value.trim());
+ });
+ categoryEvent.addEventListener('change', () => {
updateAllCheckbox();
triggerSearch(searchInput.value.trim());
});
diff --git a/_ml-commons-plugin/api/async-batch-ingest.md b/_ml-commons-plugin/api/async-batch-ingest.md
new file mode 100644
index 0000000000..ace95ba4d4
--- /dev/null
+++ b/_ml-commons-plugin/api/async-batch-ingest.md
@@ -0,0 +1,97 @@
+---
+layout: default
+title: Asynchronous batch ingestion
+parent: ML Commons APIs
+has_children: false
+has_toc: false
+nav_order: 35
+---
+
+# Asynchronous batch ingestion
+**Introduced 2.17**
+{: .label .label-purple }
+
+Use the Asynchronous Batch Ingestion API to ingest data into your OpenSearch cluster from your files on remote file servers, such as Amazon Simple Storage Service (Amazon S3) or OpenAI. For detailed configuration steps, see [Asynchronous batch ingestion]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/async-batch-ingestion/).
+
+## Path and HTTP methods
+
+```json
+POST /_plugins/_ml/_batch_ingestion
+```
+
+#### Request fields
+
+The following table lists the available request fields.
+
+Field | Data type | Required/Optional | Description
+:--- | :--- | :---
+`index_name`| String | Required | The index name.
+`field_map` | Object | Required | Maps fields from the source file to specific fields in an OpenSearch index for ingestion.
+`ingest_fields` | Array | Optional | Lists fields from the source file that should be ingested directly into the OpenSearch index without any additional mapping.
+`credential` | Object | Required | Contains the authentication information for accessing external data sources, such as Amazon S3 or OpenAI.
+`data_source` | Object | Required | Specifies the type and location of the external file(s) from which the data is ingested.
+`data_source.type` | String | Required | Specifies the type of the external data source. Valid values are `s3` and `openAI`.
+`data_source.source` | Array | Required | Specifies one or more file locations from which the data is ingested. For `s3`, specify the file path to the Amazon S3 bucket (for example, `["s3://offlinebatch/output/sagemaker_batch.json.out"]`). For `openAI`, specify the file IDs for input or output files (for example, `["file-", "file-", "file-"]`).
+
+## Example request: Ingesting a single file
+
+```json
+POST /_plugins/_ml/_batch_ingestion
+{
+ "index_name": "my-nlp-index",
+ "field_map": {
+ "chapter": "$.content[0]",
+ "title": "$.content[1]",
+ "chapter_embedding": "$.SageMakerOutput[0]",
+ "title_embedding": "$.SageMakerOutput[1]",
+ "_id": "$.id"
+ },
+ "ingest_fields": ["$.id"],
+ "credential": {
+ "region": "us-east-1",
+ "access_key": "",
+ "secret_key": "",
+ "session_token": ""
+ },
+ "data_source": {
+ "type": "s3",
+ "source": ["s3://offlinebatch/output/sagemaker_batch.json.out"]
+ }
+}
+```
+{% include copy-curl.html %}
+
+## Example request: Ingesting multiple files
+
+```json
+POST /_plugins/_ml/_batch_ingestion
+{
+ "index_name": "my-nlp-index-openai",
+ "field_map": {
+ "question": "source[1].$.body.input[0]",
+ "answer": "source[1].$.body.input[1]",
+ "question_embedding":"source[0].$.response.body.data[0].embedding",
+ "answer_embedding":"source[0].$.response.body.data[1].embedding",
+ "_id": ["source[0].$.custom_id", "source[1].$.custom_id"]
+ },
+ "ingest_fields": ["source[2].$.custom_field1", "source[2].$.custom_field2"],
+ "credential": {
+ "openAI_key": ""
+ },
+ "data_source": {
+ "type": "openAI",
+ "source": ["file-", "file-", "file-"]
+ }
+}
+```
+{% include copy-curl.html %}
+
+## Example response
+
+```json
+{
+ "task_id": "cbsPlpEBMHcagzGbOQOx",
+ "task_type": "BATCH_INGEST",
+ "status": "CREATED"
+}
+```
diff --git a/_ml-commons-plugin/api/connector-apis/update-connector.md b/_ml-commons-plugin/api/connector-apis/update-connector.md
index 64790bb57f..625d58bb62 100644
--- a/_ml-commons-plugin/api/connector-apis/update-connector.md
+++ b/_ml-commons-plugin/api/connector-apis/update-connector.md
@@ -29,17 +29,20 @@ PUT /_plugins/_ml/connectors/
The following table lists the updatable fields. For more information about all connector fields, see [Blueprint configuration parameters]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints#configuration-parameters).
-| Field | Data type | Description |
-| :--- | :--- | :--- |
-| `name` | String | The name of the connector. |
-| `description` | String | A description of the connector. |
-| `version` | Integer | The version of the connector. |
-| `protocol` | String | The protocol for the connection. For AWS services, such as Amazon SageMaker and Amazon Bedrock, use `aws_sigv4`. For all other services, use `http`. |
-| `parameters` | JSON object | The default connector parameters, including `endpoint` and `model`. Any parameters included in this field can be overridden by parameters specified in a predict request. |
+| Field | Data type | Description |
+| :--- |:------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `name` | String | The name of the connector. |
+| `description` | String | A description of the connector. |
+| `version` | Integer | The connector version. |
+| `protocol` | String | The protocol for the connection. For AWS services, such as Amazon SageMaker and Amazon Bedrock, use `aws_sigv4`. For all other services, use `http`. |
+| `parameters` | JSON object | The default connector parameters, including `endpoint` and `model`. Any parameters included in this field can be overridden by parameters specified in a predict request. |
| `credential` | JSON object | Defines any credential variables required in order to connect to your chosen endpoint. ML Commons uses **AES/GCM/NoPadding** symmetric encryption to encrypt your credentials. When the connection to the cluster first starts, OpenSearch creates a random 32-byte encryption key that persists in OpenSearch's system index. Therefore, you do not need to manually set the encryption key. |
-| `actions` | JSON array | Defines which actions can run within the connector. If you're an administrator creating a connection, add the [blueprint]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/) for your desired connection. |
-| `backend_roles` | JSON array | A list of OpenSearch backend roles. For more information about setting up backend roles, see [Assigning backend roles to users]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#assigning-backend-roles-to-users). |
-| `access_mode` | String | Sets the access mode for the model, either `public`, `restricted`, or `private`. Default is `private`. For more information about `access_mode`, see [Model groups]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#model-groups). |
+| `actions` | JSON array | Defines which actions can run within the connector. If you're an administrator creating a connection, add the [blueprint]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/) for your desired connection. |
+| `backend_roles` | JSON array | A list of OpenSearch backend roles. For more information about setting up backend roles, see [Assigning backend roles to users]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#assigning-backend-roles-to-users). |
+| `access_mode` | String | Sets the access mode for the model, either `public`, `restricted`, or `private`. Default is `private`. For more information about `access_mode`, see [Model groups]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#model-groups). |
+| `parameters.skip_validating_missing_parameters` | Boolean | When set to `true`, this option allows you to send a request using a connector without validating any missing parameters. Default is `false`. |
+
+
#### Example request
diff --git a/_ml-commons-plugin/api/execute-algorithm.md b/_ml-commons-plugin/api/execute-algorithm.md
index 7b06cfefe8..6acd926444 100644
--- a/_ml-commons-plugin/api/execute-algorithm.md
+++ b/_ml-commons-plugin/api/execute-algorithm.md
@@ -2,7 +2,7 @@
layout: default
title: Execute algorithm
parent: ML Commons APIs
-nav_order: 30
+nav_order: 37
---
# Execute algorithm
diff --git a/_ml-commons-plugin/api/model-apis/batch-predict.md b/_ml-commons-plugin/api/model-apis/batch-predict.md
index b32fbb108d..c1dc7348fe 100644
--- a/_ml-commons-plugin/api/model-apis/batch-predict.md
+++ b/_ml-commons-plugin/api/model-apis/batch-predict.md
@@ -31,7 +31,13 @@ POST /_plugins/_ml/models//_batch_predict
## Prerequisites
-Before using the Batch Predict API, you need to create a connector to the externally hosted model. For example, to create a connector to an OpenAI `text-embedding-ada-002` model, send the following request:
+Before using the Batch Predict API, you need to create a connector to the externally hosted model. For each action, specify the `action_type` parameter that describes the action:
+
+- `batch_predict`: Runs the batch predict operation.
+- `batch_predict_status`: Checks the batch predict operation status.
+- `cancel_batch_predict`: Cancels the batch predict operation.
+
+For example, to create a connector to an OpenAI `text-embedding-ada-002` model, send the following request. The `cancel_batch_predict` action is optional and supports canceling the batch job running on OpenAI:
```json
POST /_plugins/_ml/connectors/_create
@@ -68,6 +74,22 @@ POST /_plugins/_ml/connectors/_create
"Authorization": "Bearer ${credential.openAI_key}"
},
"request_body": "{ \"input_file_id\": \"${parameters.input_file_id}\", \"endpoint\": \"${parameters.endpoint}\", \"completion_window\": \"24h\" }"
+ },
+ {
+ "action_type": "batch_predict_status",
+ "method": "GET",
+ "url": "https://api.openai.com/v1/batches/${parameters.id}",
+ "headers": {
+ "Authorization": "Bearer ${credential.openAI_key}"
+ }
+ },
+ {
+ "action_type": "cancel_batch_predict",
+ "method": "POST",
+ "url": "https://api.openai.com/v1/batches/${parameters.id}/cancel",
+ "headers": {
+ "Authorization": "Bearer ${credential.openAI_key}"
+ }
}
]
}
@@ -123,45 +145,87 @@ POST /_plugins/_ml/models/lyjxwZABNrAVdFa9zrcZ/_batch_predict
#### Example response
+The response contains the task ID for the batch predict operation:
+
```json
{
- "inference_results": [
- {
- "output": [
- {
- "name": "response",
- "dataAsMap": {
- "id": "batch_",
- "object": "batch",
- "endpoint": "/v1/embeddings",
- "errors": null,
- "input_file_id": "file-",
- "completion_window": "24h",
- "status": "validating",
- "output_file_id": null,
- "error_file_id": null,
- "created_at": 1722037257,
- "in_progress_at": null,
- "expires_at": 1722123657,
- "finalizing_at": null,
- "completed_at": null,
- "failed_at": null,
- "expired_at": null,
- "cancelling_at": null,
- "cancelled_at": null,
- "request_counts": {
- "total": 0,
- "completed": 0,
- "failed": 0
- },
- "metadata": null
- }
- }
- ],
- "status_code": 200
- }
- ]
+ "task_id": "KYZSv5EBqL2d0mFvs80C",
+ "status": "CREATED"
}
```
-For the definition of each field in the result, see [OpenAI Batch API](https://platform.openai.com/docs/guides/batch). Once the batch inference is complete, you can download the output by calling the [OpenAI Files API](https://platform.openai.com/docs/api-reference/files) and providing the file name specified in the `id` field of the response.
\ No newline at end of file
+To check the status of the batch predict job, provide the task ID to the [Tasks API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/get-task/). You can find the job details in the `remote_job` field in the task. Once the prediction is complete, the task `state` changes to `COMPLETED`.
+
+#### Example request
+
+```json
+GET /_plugins/_ml/tasks/KYZSv5EBqL2d0mFvs80C
+```
+{% include copy-curl.html %}
+
+#### Example response
+
+The response contains the batch predict operation details in the `remote_job` field:
+
+```json
+{
+ "model_id": "JYZRv5EBqL2d0mFvKs1E",
+ "task_type": "BATCH_PREDICTION",
+ "function_name": "REMOTE",
+ "state": "RUNNING",
+ "input_type": "REMOTE",
+ "worker_node": [
+ "Ee5OCIq0RAy05hqQsNI1rg"
+ ],
+ "create_time": 1725491751455,
+ "last_update_time": 1725491751455,
+ "is_async": false,
+ "remote_job": {
+ "cancelled_at": null,
+ "metadata": null,
+ "request_counts": {
+ "total": 3,
+ "completed": 3,
+ "failed": 0
+ },
+ "input_file_id": "file-XXXXXXXXXXXX",
+ "output_file_id": "file-XXXXXXXXXXXXX",
+ "error_file_id": null,
+ "created_at": 1725491753,
+ "in_progress_at": 1725491753,
+ "expired_at": null,
+ "finalizing_at": 1725491757,
+ "completed_at": null,
+ "endpoint": "/v1/embeddings",
+ "expires_at": 1725578153,
+ "cancelling_at": null,
+ "completion_window": "24h",
+ "id": "batch_XXXXXXXXXXXXXXX",
+ "failed_at": null,
+ "errors": null,
+ "object": "batch",
+ "status": "in_progress"
+ }
+}
+```
+
+For the definition of each field in the result, see [OpenAI Batch API](https://platform.openai.com/docs/guides/batch). Once the batch inference is complete, you can download the output by calling the [OpenAI Files API](https://platform.openai.com/docs/api-reference/files) and providing the file name specified in the `id` field of the response.
+
+### Canceling a batch predict job
+
+You can also cancel the batch predict operation running on the remote platform using the task ID returned by the batch predict request. To add this capability, set the `action_type` to `cancel_batch_predict` in the connector configuration when creating the connector.
+
+#### Example request
+
+```json
+POST /_plugins/_ml/tasks/KYZSv5EBqL2d0mFvs80C/_cancel_batch
+```
+{% include copy-curl.html %}
+
+#### Example response
+
+```json
+{
+ "status": "OK"
+}
+```
diff --git a/_ml-commons-plugin/remote-models/async-batch-ingestion.md b/_ml-commons-plugin/remote-models/async-batch-ingestion.md
new file mode 100644
index 0000000000..a09c028477
--- /dev/null
+++ b/_ml-commons-plugin/remote-models/async-batch-ingestion.md
@@ -0,0 +1,190 @@
+---
+layout: default
+title: Asynchronous batch ingestion
+nav_order: 90
+parent: Connecting to externally hosted models
+grand_parent: Integrating ML models
+---
+
+
+# Asynchronous batch ingestion
+**Introduced 2.17**
+{: .label .label-purple }
+
+[Batch ingestion]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/batch-ingestion/) configures an ingest pipeline, which processes documents one by one. For each document, batch ingestion calls an externally hosted model to generate text embeddings from the document text and then ingests the document, including text and embeddings, into an OpenSearch index.
+
+An alternative to this real-time process, _asynchronous_ batch ingestion, ingests both documents and their embeddings generated outside of OpenSearch and stored on a remote file server, such as Amazon Simple Storage Service (Amazon S3) or OpenAI. Asynchronous ingestion returns a task ID and runs asynchronously to ingest data offline into your k-NN cluster for neural search. You can use asynchronous batch ingestion together with the [Batch Predict API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/batch-predict/) to perform inference asynchronously. The batch predict operation takes an input file containing documents and calls an externally hosted model to generate embeddings for those documents in an output file. You can then use asynchronous batch ingestion to ingest both the input file containing documents and the output file containing their embeddings into an OpenSearch index.
+
+As of OpenSearch 2.17, the Asynchronous Batch Ingestion API is supported by Amazon SageMaker, Amazon Bedrock, and OpenAI.
+{: .note}
+
+## Prerequisites
+
+Before using asynchronous batch ingestion, you must generate text embeddings using a model of your choice and store the output on a file server, such as Amazon S3. For example, you can store the output of a Batch API call to an Amazon SageMaker text embedding model in a file with the Amazon S3 output path `s3://offlinebatch/output/sagemaker_batch.json.out`. The output is in JSONL format, with each line representing a text embedding result. The file contents have the following format:
+
+```
+{"SageMakerOutput":[[-0.017166402,0.055771016,...],[-0.06422759,-0.004301484,...]],"content":["this is chapter 1","harry potter"],"id":1}
+{"SageMakerOutput":[[-0.017455402,0.023771016,...],[-0.02322759,-0.009101284,...]],"content":["this is chapter 2","draco malfoy"],"id":1}
+...
+```
+
+## Ingesting data from a single file
+
+First, create a k-NN index into which you'll ingest the data. The fields in the k-NN index represent the structure of the data in the source file.
+
+In this example, the source file holds documents containing titles and chapters, along with their corresponding embeddings. Thus, you'll create a k-NN index with the fields `id`, `chapter_embedding`, `chapter`, `title_embedding`, and `title`:
+
+```json
+PUT /my-nlp-index
+{
+ "settings": {
+ "index.knn": true
+ },
+ "mappings": {
+ "properties": {
+ "id": {
+ "type": "text"
+ },
+ "chapter_embedding": {
+ "type": "knn_vector",
+ "dimension": 384,
+ "method": {
+ "engine": "nmslib",
+ "space_type": "cosinesimil",
+ "name": "hnsw",
+ "parameters": {
+ "ef_construction": 512,
+ "m": 16
+ }
+ }
+ },
+ "chapter": {
+ "type": "text"
+ },
+ "title_embedding": {
+ "type": "knn_vector",
+ "dimension": 384,
+ "method": {
+ "engine": "nmslib",
+ "space_type": "cosinesimil",
+ "name": "hnsw",
+ "parameters": {
+ "ef_construction": 512,
+ "m": 16
+ }
+ }
+ },
+ "title": {
+ "type": "text"
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+When using an S3 file as the source for asynchronous batch ingestion, you must map the fields in the source file to fields in the index in order to indicate into which index field each piece of data is ingested. If no JSON path is provided for a field, that field will be set to `null` in the k-NN index.
+
+In the `field_map`, indicate the location of the data for each field in the source file. You can also specify fields to be ingested directly into your index without making any changes to the source file by adding their JSON paths to the `ingest_fields` array. For example, in the following asynchronous batch ingestion request, the element with the JSON path `$.id` from the source file is ingested directly into the `id` field of your index. To ingest this data from the Amazon S3 file, send the following request to your OpenSearch endpoint:
+
+```json
+POST /_plugins/_ml/_batch_ingestion
+{
+ "index_name": "my-nlp-index",
+ "field_map": {
+ "chapter": "$.content[0]",
+ "title": "$.content[1]",
+ "chapter_embedding": "$.SageMakerOutput[0]",
+ "title_embedding": "$.SageMakerOutput[1]",
+ "_id": "$.id"
+ },
+ "ingest_fields": ["$.id"],
+ "credential": {
+ "region": "us-east-1",
+ "access_key": "",
+ "secret_key": "",
+ "session_token": ""
+ },
+ "data_source": {
+ "type": "s3",
+ "source": ["s3://offlinebatch/output/sagemaker_batch.json.out"]
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response contains a task ID for the ingestion task:
+
+```json
+{
+ "task_id": "cbsPlpEBMHcagzGbOQOx",
+ "task_type": "BATCH_INGEST",
+ "status": "CREATED"
+}
+```
+
+To check the status of the operation, provide the task ID to the [Tasks API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/get-task/). Once ingestion is complete, the task `state` changes to `COMPLETED`.
+
+
+## Ingesting data from multiple files
+
+You can also ingest data from multiple files by specifying the file locations in the `source`. The following example ingests data from three OpenAI files.
+
+The OpenAI Batch API input file is formatted as follows:
+
+```
+{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "text-embedding-ada-002", "input": [ "What is the meaning of life?", "The food was delicious and the waiter..."]}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "text-embedding-ada-002", "input": [ "What is the meaning of work?", "The travel was fantastic and the view..."]}}
+{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "text-embedding-ada-002", "input": [ "What is the meaning of friend?", "The old friend was far away and the time..."]}}
+...
+```
+
+The OpenAI Batch API output file is formatted as follows:
+
+```
+{"id": "batch_req_ITKQn29igorXCAGp6wzYs5IS", "custom_id": "request-1", "response": {"status_code": 200, "request_id": "10845755592510080d13054c3776aef4", "body": {"object": "list", "data": [{"object": "embedding", "index": 0, "embedding": [0.0044326545, ... ...]}, {"object": "embedding", "index": 1, "embedding": [0.002297497, ... ... ]}], "model": "text-embedding-ada-002", "usage": {"prompt_tokens": 15, "total_tokens": 15}}}, "error": null}
+...
+```
+
+If you have run the Batch API in OpenAI for text embedding and want to ingest the model input and output files along with some metadata into your index, send the following asynchronous ingestion request. Make sure to use `source[file-index]` to identify the file's location in the source array in the request body. For example, `source[0]` refers to the first file in the `data_source.source` array.
+
+The following request ingests seven fields into your index: Five are specified in the `field_map` section and two are specified in `ingest_fields`. The format follows the pattern `sourcefile.jsonPath`, indicating the JSON path for each file. In the `field_map`, `$.body.input[0]` is used as the JSON path to ingest data into the `question` field from the second file in the `source` array. The `ingest_fields` array lists all elements from the `source` files that will be ingested directly into your index:
+
+```json
+POST /_plugins/_ml/_batch_ingestion
+{
+ "index_name": "my-nlp-index-openai",
+ "field_map": {
+ "question": "source[1].$.body.input[0]",
+ "answer": "source[1].$.body.input[1]",
+ "question_embedding":"source[0].$.response.body.data[0].embedding",
+ "answer_embedding":"source[0].$.response.body.data[1].embedding",
+ "_id": ["source[0].$.custom_id", "source[1].$.custom_id"]
+ },
+ "ingest_fields": ["source[2].$.custom_field1", "source[2].$.custom_field2"],
+ "credential": {
+ "openAI_key": ""
+ },
+ "data_source": {
+ "type": "openAI",
+ "source": ["file-", "file-", "file-"]
+ }
+}
+```
+{% include copy-curl.html %}
+
+In the request, make sure to define the `_id` field in the `field_map`. This is necessary in order to map each data entry from the three separate files.
+
+The response contains a task ID for the ingestion task:
+
+```json
+{
+ "task_id": "cbsPlpEBMHcagzGbOQOx",
+ "task_type": "BATCH_INGEST",
+ "status": "CREATED"
+}
+```
+
+To check the status of the operation, provide the task ID to the [Tasks API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/get-task/). Once ingestion is complete, the task `state` changes to `COMPLETED`.
+
+For request field descriptions, see [Asynchronous Batch Ingestion API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/async-batch-ingest/).
\ No newline at end of file
diff --git a/_ml-commons-plugin/remote-models/blueprints.md b/_ml-commons-plugin/remote-models/blueprints.md
index 254a21b068..9b95c31166 100644
--- a/_ml-commons-plugin/remote-models/blueprints.md
+++ b/_ml-commons-plugin/remote-models/blueprints.md
@@ -55,19 +55,20 @@ As an ML developer, you can build connector blueprints for other platforms. Usin
## Configuration parameters
-| Field | Data type | Is required | Description |
-|:---|:---|:---|:---|
-| `name` | String | Yes | The name of the connector. |
-| `description` | String | Yes | A description of the connector. |
-| `version` | Integer | Yes | The version of the connector. |
-| `protocol` | String | Yes | The protocol for the connection. For AWS services such as Amazon SageMaker and Amazon Bedrock, use `aws_sigv4`. For all other services, use `http`. |
-| `parameters` | JSON object | Yes | The default connector parameters, including `endpoint` and `model`. Any parameters indicated in this field can be overridden by parameters specified in a predict request. |
-| `credential` | JSON object | Yes | Defines any credential variables required to connect to your chosen endpoint. ML Commons uses **AES/GCM/NoPadding** symmetric encryption to encrypt your credentials. When the connection to the cluster first starts, OpenSearch creates a random 32-byte encryption key that persists in OpenSearch's system index. Therefore, you do not need to manually set the encryption key. |
-| `actions` | JSON array | Yes | Defines what actions can run within the connector. If you're an administrator creating a connection, add the [blueprint]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/) for your desired connection. |
-| `backend_roles` | JSON array | Yes | A list of OpenSearch backend roles. For more information about setting up backend roles, see [Assigning backend roles to users]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#assigning-backend-roles-to-users). |
-| `access_mode` | String | Yes | Sets the access mode for the model, either `public`, `restricted`, or `private`. Default is `private`. For more information about `access_mode`, see [Model groups]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#model-groups). |
-| `add_all_backend_roles` | Boolean | Yes | When set to `true`, adds all `backend_roles` to the access list, which only a user with admin permissions can adjust. When set to `false`, non-admins can add `backend_roles`. |
-| `client_config` | JSON object | No | The client configuration object, which provides settings that control the behavior of the client connections used by the connector. These settings allow you to manage connection limits and timeouts, ensuring efficient and reliable communication. |
+| Field | Data type | Is required | Description |
+|:-------------------------------------------------|:---|:------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `name` | String | Yes | The name of the connector. |
+| `description` | String | Yes | A description of the connector. |
+| `version` | Integer | Yes | The connector version. |
+| `protocol` | String | Yes | The protocol for the connection. For AWS services, such as Amazon SageMaker and Amazon Bedrock, use `aws_sigv4`. For all other services, use `http`. |
+| `parameters` | JSON object | Yes | The default connector parameters, including `endpoint`, `model`, and `skip_validating_missing_parameters`. Any parameters indicated in this field can be overridden by parameters specified in a predict request. |
+| `credential` | JSON object | Yes | Defines any credential variables required for connecting to your chosen endpoint. ML Commons uses **AES/GCM/NoPadding** symmetric encryption to encrypt your credentials. When the cluster connection is initiated, OpenSearch creates a random 32-byte encryption key that persists in OpenSearch's system index. Therefore, you do not need to manually set the encryption key. |
+| `actions` | JSON array | Yes | Defines the actions that can run within the connector. If you're an administrator creating a connection, add the [blueprint]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/) for your desired connection. |
+| `backend_roles` | JSON array | Yes | A list of OpenSearch backend roles. For more information about setting up backend roles, see [Assigning backend roles to users]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#assigning-backend-roles-to-users). |
+| `access_mode` | String | Yes | Sets the access mode for the model, either `public`, `restricted`, or `private`. Default is `private`. For more information about `access_mode`, see [Model groups]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#model-groups). |
+| `add_all_backend_roles` | Boolean | Yes | When set to `true`, adds all `backend_roles` to the access list, which only a user with admin permissions can adjust. When set to `false`, non-admins can add `backend_roles`. |
+| `client_config` | JSON object | No | The client configuration object, which provides settings that control the behavior of the client connections used by the connector. These settings allow you to manage connection limits and timeouts, ensuring efficient and reliable communication. |
+| `parameters.skip_validating_missing_parameters` | Boolean | No | When set to `true`, this option allows you to send a request using a connector without validating any missing parameters. Default is `false`. |
The `actions` parameter supports the following options.
@@ -76,12 +77,11 @@ The `actions` parameter supports the following options.
|:---|:---|:---|
| `action_type` | String | Required. Sets the ML Commons API operation to use upon connection. As of OpenSearch 2.9, only `predict` is supported. |
| `method` | String | Required. Defines the HTTP method for the API call. Supports `POST` and `GET`. |
-| `url` | String | Required. Sets the connection endpoint at which the action occurs. This must match the regex expression for the connection used when [adding trusted endpoints]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index#adding-trusted-endpoints). |
-| `headers` | JSON object | Sets the headers used inside the request or response body. Default is `ContentType: application/json`. If your third-party ML tool requires access control, define the required `credential` parameters in the `headers` parameter. |
+| `url` | String | Required. Specifies the connection endpoint at which the action occurs. This must match the regex expression for the connection used when [adding trusted endpoints]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index#adding-trusted-endpoints).|
 | `request_body` | String | Required. Sets the parameters contained in the request body of the action. The parameters must include `\"inputText\"`, which specifies how users of the connector should construct the request payload for the `action_type`. |
| `pre_process_function` | String | Optional. A built-in or custom Painless script used to preprocess the input data. OpenSearch provides the following built-in preprocess functions that you can call directly:
- `connector.pre_process.cohere.embedding` for [Cohere](https://cohere.com/) embedding models
- `connector.pre_process.openai.embedding` for [OpenAI](https://platform.openai.com/docs/guides/embeddings) embedding models
- `connector.pre_process.default.embedding`, which you can use to preprocess documents in neural search requests so that they are in the format that ML Commons can process with the default preprocessor (OpenSearch 2.11 or later). For more information, see [Built-in functions](#built-in-pre--and-post-processing-functions). |
| `post_process_function` | String | Optional. A built-in or custom Painless script used to post-process the model output data. OpenSearch provides the following built-in post-process functions that you can call directly:
- `connector.pre_process.cohere.embedding` for [Cohere text embedding models](https://docs.cohere.com/reference/embed)
- `connector.pre_process.openai.embedding` for [OpenAI text embedding models](https://platform.openai.com/docs/api-reference/embeddings)
- `connector.post_process.default.embedding`, which you can use to post-process documents in the model response so that they are in the format that neural search expects (OpenSearch 2.11 or later). For more information, see [Built-in functions](#built-in-pre--and-post-processing-functions). |
-
+| `headers` | JSON object | Specifies the headers used in the request or response body. Default is `ContentType: application/json`. If your third-party ML tool requires access control, define the required `credential` parameters in the `headers` parameter. |
The `client_config` parameter supports the following options.
diff --git a/_ml-commons-plugin/tutorials/semantic-search-byte-vectors.md b/_ml-commons-plugin/tutorials/semantic-search-byte-vectors.md
index 7061d3cb5a..c4cc27f660 100644
--- a/_ml-commons-plugin/tutorials/semantic-search-byte-vectors.md
+++ b/_ml-commons-plugin/tutorials/semantic-search-byte-vectors.md
@@ -7,7 +7,7 @@ nav_order: 10
# Semantic search using byte-quantized vectors
-This tutorial illustrates how to build a semantic search using the [Cohere Embed model](https://docs.cohere.com/reference/embed) and byte-quantized vectors. For more information about using byte-quantized vectors, see [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#lucene-byte-vector).
+This tutorial shows you how to build a semantic search using the [Cohere Embed model](https://docs.cohere.com/reference/embed) and byte-quantized vectors. For more information about using byte-quantized vectors, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#byte-vectors).
The Cohere Embed v3 model supports several `embedding_types`. For this tutorial, you'll use the `INT8` type to encode byte-quantized vectors.
diff --git a/_monitoring-your-cluster/pa/index.md b/_monitoring-your-cluster/pa/index.md
index bb4f9c6c30..156e985e8b 100644
--- a/_monitoring-your-cluster/pa/index.md
+++ b/_monitoring-your-cluster/pa/index.md
@@ -60,7 +60,7 @@ private-key-file-path = specify_path
The Performance Analyzer plugin is included in the installations for [Docker]({{site.url}}{{site.baseurl}}/opensearch/install/docker/) and [tarball]({{site.url}}{{site.baseurl}}/opensearch/install/tar/), but you can also install the plugin manually.
-To install the Performance Analyzer plugin manually, download the plugin from [Maven](https://search.maven.org/search?q=org.opensearch.plugin) and install it using the standard [plugin installation]({{site.url}}{{site.baseurl}}/opensearch/install/plugins/) process. Performance Analyzer runs on each node in a cluster.
+To install the Performance Analyzer plugin manually, download the plugin from [Maven](https://central.sonatype.com/namespace/org.opensearch.plugin) and install it using the standard [plugin installation]({{site.url}}{{site.baseurl}}/opensearch/install/plugins/) process. Performance Analyzer runs on each node in a cluster.
To start the Performance Analyzer root cause analysis (RCA) agent on a tarball installation, run the following command:
diff --git a/_observing-your-data/ad/dashboards-anomaly-detection.md b/_observing-your-data/ad/dashboards-anomaly-detection.md
index 679237094a..ad6fa5950b 100644
--- a/_observing-your-data/ad/dashboards-anomaly-detection.md
+++ b/_observing-your-data/ad/dashboards-anomaly-detection.md
@@ -18,12 +18,12 @@ You can connect data visualizations to OpenSearch datasets and then create, run,
Before getting started, you must have:
- Installed OpenSearch and OpenSearch Dashboards version 2.9 or later. See [Installing OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/index/).
-- Installed the Anomaly Detection plugin version 2.9 or later. See [Installing OpenSearch plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins).
+- Installed the Anomaly Detection plugin version 2.9 or later. See [Installing OpenSearch plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/).
- Installed the Anomaly Detection Dashboards plugin version 2.9 or later. See [Managing OpenSearch Dashboards plugins]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/plugins/) to get started.
## General requirements for anomaly detection visualizations
-Anomaly detection visualizations are displayed as time-series charts that give you a snapshot of when anomalies have occurred from different anomaly detectors you have configured for the visualization. You can display up to 10 metrics on your chart, and each series can be shown as a line on the chart. Note that only real-time anomalies will be visible on the chart. For more information on real-time and historical anomaly detection, see [Anomaly detection, Step 3: Set up detector jobs]({{site.url}}{{site.baseurl}}/observing-your-data/ad/index/#step-3-set-up-detector-jobs).
+Anomaly detection visualizations are displayed as time-series charts that give you a snapshot of when anomalies have occurred from different anomaly detectors you have configured for the visualization. You can display up to 10 metrics on your chart, and each series can be shown as a line on the chart. Note that only real-time anomalies will be visible on the chart. For more information about real-time and historical anomaly detection, see [Anomaly detection, Step 3: Set up detector jobs]({{site.url}}{{site.baseurl}}/observing-your-data/ad/index/#step-3-setting-up-detector-jobs).
Keep in mind the following requirements when setting up or creating anomaly detection visualizations. The visualization:
diff --git a/_observing-your-data/ad/index.md b/_observing-your-data/ad/index.md
index 5dfa1b8f1a..657c3c90cb 100644
--- a/_observing-your-data/ad/index.md
+++ b/_observing-your-data/ad/index.md
@@ -10,30 +10,42 @@ redirect_from:
# Anomaly detection
-An anomaly in OpenSearch is any unusual behavior change in your time-series data. Anomalies can provide valuable insights into your data. For example, for IT infrastructure data, an anomaly in the memory usage metric might help you uncover early signs of a system failure.
+An _anomaly_ in OpenSearch is any unusual behavior change in your time-series data. Anomalies can provide valuable insights into your data. For example, for IT infrastructure data, an anomaly in the memory usage metric can help identify early signs of a system failure.
-It can be challenging to discover anomalies using conventional methods such as creating visualizations and dashboards. You could configure an alert based on a static threshold, but this requires prior domain knowledge and isn't adaptive to data that exhibits organic growth or seasonal behavior.
+Conventional techniques like visualizations and dashboards can make it difficult to uncover anomalies. Configuring alerts based on static thresholds is possible, but this approach requires prior domain knowledge and may not adapt to data with organic growth or seasonal trends.
-Anomaly detection automatically detects anomalies in your OpenSearch data in near real-time using the Random Cut Forest (RCF) algorithm. RCF is an unsupervised machine learning algorithm that models a sketch of your incoming data stream to compute an `anomaly grade` and `confidence score` value for each incoming data point. These values are used to differentiate an anomaly from normal variations. For more information about how RCF works, see [Random Cut Forests](https://www.semanticscholar.org/paper/Robust-Random-Cut-Forest-Based-Anomaly-Detection-on-Guha-Mishra/ecb365ef9b67cd5540cc4c53035a6a7bd88678f9).
+Anomaly detection automatically detects anomalies in your OpenSearch data in near real time using the Random Cut Forest (RCF) algorithm. RCF is an unsupervised machine learning algorithm that models a sketch of your incoming data stream to compute an _anomaly grade_ and _confidence score_ value for each incoming data point. These values are used to differentiate an anomaly from normal variations. For more information about how RCF works, see [Robust Random Cut Forest Based Anomaly Detection on Streams](https://www.semanticscholar.org/paper/Robust-Random-Cut-Forest-Based-Anomaly-Detection-on-Guha-Mishra/ecb365ef9b67cd5540cc4c53035a6a7bd88678f9).
You can pair the Anomaly Detection plugin with the [Alerting plugin]({{site.url}}{{site.baseurl}}/monitoring-plugins/alerting/) to notify you as soon as an anomaly is detected.
+{: .note}
-To get started, choose **Anomaly Detection** in OpenSearch Dashboards.
-To first test with sample streaming data, you can try out one of the preconfigured detectors with one of the sample datasets.
+## Getting started with anomaly detection in OpenSearch Dashboards
+
+To get started, go to **OpenSearch Dashboards** > **OpenSearch Plugins** > **Anomaly Detection**.
## Step 1: Define a detector
-A detector is an individual anomaly detection task. You can define multiple detectors, and all the detectors can run simultaneously, with each analyzing data from different sources.
+A _detector_ is an individual anomaly detection task. You can define multiple detectors, and all detectors can run simultaneously, with each analyzing data from different sources. You can define a detector by following these steps:
+
+1. On the **Anomaly detection** page, select the **Create detector** button.
+2. On the **Define detector** page, enter the required information in the **Detector details** pane.
+3. In the **Select data** pane, specify the data source by choosing a source from the **Index** dropdown menu. You can choose an index, index patterns, or an alias.
+4. (Optional) Filter the data source by selecting **Add data filter** and then entering the conditions for **Field**, **Operator**, and **Value**. Alternatively, you can choose **Use query DSL** and add your JSON filter query. Only [Boolean queries]({{site.url}}{{site.baseurl}}/query-dsl/compound/bool/) are supported for query domain-specific language (DSL).
+#### Example: Filtering data using query DSL
+
+The following example query retrieves documents in which the `urlPath.keyword` field matches any of the specified values:
1. Choose **Create detector**.
1. Add in the detector details.
    - Enter a name and brief description. Make sure the name is unique and descriptive enough to help you identify the purpose of the detector.
1. Specify the data source.
- - For **Data source**, choose the index you want to use as the data source. You can optionally use index patterns to choose multiple indexes.
+ - For **Data source**, choose one or more indexes to use as the data source. Alternatively, you can use an alias or index pattern to choose multiple indexes.
+   - Detectors can use remote indexes. You can access them using the `cluster-name:index-name` pattern. See [Cross-cluster search]({{site.url}}{{site.baseurl}}/search-plugins/cross-cluster-search/) for more information. Alternatively, you can select clusters and indexes in OpenSearch Dashboards 2.17 or later. To learn about configuring remote indexes with the Security plugin enabled, see [Selecting remote indexes with fine-grained access control]({{site.url}}{{site.baseurl}}/observing-your-data/ad/security/#selecting-remote-indexes-with-fine-grained-access-control) in the [Anomaly detection security]({{site.url}}{{site.baseurl}}/observing-your-data/ad/security/) documentation.
- (Optional) For **Data filter**, filter the index you chose as the data source. From the **Data filter** menu, choose **Add data filter**, and then design your filter query by selecting **Field**, **Operator**, and **Value**, or choose **Use query DSL** and add your own JSON filter query. Only [Boolean queries]({{site.url}}{{site.baseurl}}/query-dsl/compound/bool/) are supported for query domain-specific language (DSL).
-#### Example filter using query DSL
-The query is designed to retrieve documents in which the `urlPath.keyword` field matches one of the following specified values:
+To create a cross-cluster detector in OpenSearch Dashboards, the following [permissions]({{site.url}}{{site.baseurl}}/security/access-control/permissions/) are required: `indices:data/read/field_caps`, `indices:admin/resolve/index`, and `cluster:monitor/remote/info`.
+{: .note}
- /domain/{id}/short
- /sub_dir/{id}/short
@@ -62,40 +74,38 @@ The query is designed to retrieve documents in which the `urlPath.keyword` field
}
}
```
+ {% include copy-curl.html %}
-1. Specify a timestamp.
- - Select the **Timestamp field** in your index.
-1. Define operation settings.
- - For **Operation settings**, define the **Detector interval**, which is the time interval at which the detector collects data.
- - The detector aggregates the data in this interval, then feeds the aggregated result into the anomaly detection model.
- The shorter you set this interval, the fewer data points the detector aggregates.
- The anomaly detection model uses a shingling process, a technique that uses consecutive data points to create a sample for the model. This process needs a certain number of aggregated data points from contiguous intervals.
-
- - We recommend setting the detector interval based on your actual data. If it's too long it might delay the results, and if it's too short it might miss some data. It also won't have a sufficient number of consecutive data points for the shingle process.
+5. In the **Timestamp** pane, select a field from the **Timestamp field** dropdown menu.
- - (Optional) To add extra processing time for data collection, specify a **Window delay** value.
+6. In the **Operation settings** pane, define the **Detector interval**, which is the interval at which the detector collects data.
+ - The detector aggregates the data at this interval and then feeds the aggregated result into the anomaly detection model. The shorter the interval, the fewer data points the detector aggregates. The anomaly detection model uses a shingling process, a technique that uses consecutive data points to create a sample for the model. This process requires a certain number of aggregated data points from contiguous intervals.
+ - You should set the detector interval based on your actual data. If the detector interval is too long, then it might delay the results. If the detector interval is too short, then it might miss some data. The detector interval also will not have a sufficient number of consecutive data points for the shingle process.
+ - (Optional) To add extra processing time for data collection, specify a **Window delay** value.
- This value tells the detector that the data is not ingested into OpenSearch in real time but with a certain delay. Set the window delay to shift the detector interval to account for this delay.
- - For example, say the detector interval is 10 minutes and data is ingested into your cluster with a general delay of 1 minute. Assume the detector runs at 2:00. The detector attempts to get the last 10 minutes of data from 1:50 to 2:00, but because of the 1-minute delay, it only gets 9 minutes of data and misses the data from 1:59 to 2:00. Setting the window delay to 1 minute shifts the interval window to 1:49--1:59, so the detector accounts for all 10 minutes of the detector interval time.
-1. Specify custom results index.
- - The Anomaly Detection plugin allows you to store anomaly detection results in a custom index of your choice. To enable this, select **Enable custom results index** and provide a name for your index, for example, `abc`. The plugin then creates an alias prefixed with `opensearch-ad-plugin-result-` followed by your chosen name, for example, `opensearch-ad-plugin-result-abc`. This alias points to an actual index with a name containing the date and a sequence number, like `opensearch-ad-plugin-result-abc-history-2024.06.12-000002`, where your results are stored.
+ - For example, the detector interval is 10 minutes and data is ingested into your cluster with a general delay of 1 minute. Assume the detector runs at 2:00. The detector attempts to get the last 10 minutes of data from 1:50 to 2:00, but because of the 1-minute delay, it only gets 9 minutes of data and misses the data from 1:59 to 2:00. Setting the window delay to 1 minute shifts the interval window to 1:49--1:59, so the detector accounts for all 10 minutes of the detector interval time.
+ - To avoid missing any data, set the **Window delay** to the upper limit of the expected ingestion delay. This ensures that the detector captures all data during its interval, reducing the risk of missing relevant information. While a longer window delay helps capture all data, too long of a window delay can hinder real-time anomaly detection because the detector will look further back in time. Find a balance to maintain both data accuracy and timely detection.
- You can use the dash “-” sign to separate the namespace to manage custom results index permissions. For example, if you use `opensearch-ad-plugin-result-financial-us-group1` as the results index, you can create a permission role based on the pattern `opensearch-ad-plugin-result-financial-us-*` to represent the "financial" department at a granular level for the "us" area.
+7. Specify a custom results index.
+ - The Anomaly Detection plugin allows you to store anomaly detection results in a custom index of your choice. Select **Enable custom results index** and provide a name for your index, for example, `abc`. The plugin then creates an alias prefixed with `opensearch-ad-plugin-result-` followed by your chosen name, for example, `opensearch-ad-plugin-result-abc`. This alias points to an actual index with a name containing the date and a sequence number, such as `opensearch-ad-plugin-result-abc-history-2024.06.12-000002`, where your results are stored.
+
+ You can use `-` to separate the namespace to manage custom results index permissions. For example, if you use `opensearch-ad-plugin-result-financial-us-group1` as the results index, you can create a permission role based on the pattern `opensearch-ad-plugin-result-financial-us-*` to represent the `financial` department at a granular level for the `us` group.
{: .note }
- When the Security plugin (fine-grained access control) is enabled, the default results index becomes a system index and is no longer accessible through the standard Index or Search APIs. To access its content, you must use the Anomaly Detection RESTful API or the dashboard. As a result, you cannot build customized dashboards using the default results index if the Security plugin is enabled. However, you can create a custom results index in order to build customized dashboards.
- If the custom index you specify does not exist, the Anomaly Detection plugin will create it when you create the detector and start your real-time or historical analysis.
- If the custom index already exists, the plugin will verify that the index mapping matches the required structure for anomaly results. In this case, ensure that the custom index has a valid mapping as defined in the [`anomaly-results.json`](https://github.com/opensearch-project/anomaly-detection/blob/main/src/main/resources/mappings/anomaly-results.json) file.
- - To use the custom results index option, you need the following permissions:
- - `indices:admin/create` - The Anomaly Detection plugin requires the ability to create and roll over the custom index.
- - `indices:admin/aliases` - The Anomaly Detection plugin requires access to create and manage an alias for the custom index.
- - `indices:data/write/index` - You need the `write` permission for the Anomaly Detection plugin to write results into the custom index for a single-entity detector.
- - `indices:data/read/search` - You need the `search` permission because the Anomaly Detection plugin needs to search custom results indexes to show results on the Anomaly Detection UI.
- - `indices:data/write/delete` - Because the detector might generate a large number of anomaly results, you need the `delete` permission to delete old data and save disk space.
- - `indices:data/write/bulk*` - You need the `bulk*` permission because the Anomaly Detection plugin uses the bulk API to write results into the custom index.
- - Managing the custom results index:
- - The anomaly detection dashboard queries all detectors’ results from all custom results indexes. Having too many custom results indexes might impact the performance of the Anomaly Detection plugin.
- - You can use [Index State Management]({{site.url}}{{site.baseurl}}/im-plugin/ism/index/) to rollover old results indexes. You can also manually delete or archive any old results indexes. We recommend reusing a custom results index for multiple detectors.
- - The Anomaly Detection plugin also provides lifecycle management for custom indexes. It rolls an alias over to a new index when the custom results index meets any of the conditions in the following table.
+ - To use the custom results index option, you must have the following permissions:
+ - `indices:admin/create` -- The `create` permission is required in order to create and roll over the custom index.
+ - `indices:admin/aliases` -- The `aliases` permission is required in order to create and manage an alias for the custom index.
+ - `indices:data/write/index` -- The `write` permission is required in order to write results into the custom index for a single-entity detector.
+ - `indices:data/read/search` -- The `search` permission is required in order to search custom results indexes to show results on the Anomaly Detection interface.
+ - `indices:data/write/delete` -- The detector may generate many anomaly results. The `delete` permission is required in order to delete old data and save disk space.
+ - `indices:data/write/bulk*` -- The `bulk*` permission is required because the plugin uses the Bulk API to write results into the custom index.
+ - When managing the custom results index, consider the following:
+ - The anomaly detection dashboard queries all detector results from all custom results indexes. Having too many custom results indexes can impact the plugin's performance.
+ - You can use [Index State Management]({{site.url}}{{site.baseurl}}/im-plugin/ism/index/) to roll over old results indexes. You can also manually delete or archive any old results indexes. Reusing a custom results index for multiple detectors is recommended.
+ - The plugin provides lifecycle management for custom indexes. It rolls over an alias to a new index when the custom results index meets any of the conditions in the following table.
Parameter | Description | Type | Unit | Example | Required
:--- | :--- |:--- |:--- |:--- |:---
@@ -103,43 +113,52 @@ The query is designed to retrieve documents in which the `urlPath.keyword` field
`result_index_min_age` | The minimum index age required for rollover, calculated from its creation time to the current time. | `integer` |`day` | `7` | No
`result_index_ttl` | The minimum age required to permanently delete rolled-over indexes. | `integer` | `day` | `60` | No
-1. Choose **Next**.
+8. Choose **Next**.
After you define the detector, the next step is to configure the model.
## Step 2: Configure the model
-#### Add features to your detector
+1. Add features to your detector.
-A feature is the field in your index that you want to check for anomalies. A detector can discover anomalies across one or more features. You must choose an aggregation method for each feature: `average()`, `count()`, `sum()`, `min()`, or `max()`. The aggregation method determines what constitutes an anomaly.
+A _feature_ is any field in your index that you want to analyze for anomalies. A detector can discover anomalies across one or more features. You must choose an aggregation method for each feature: `average()`, `count()`, `sum()`, `min()`, or `max()`. The aggregation method determines what constitutes an anomaly.
For example, if you choose `min()`, the detector focuses on finding anomalies based on the minimum values of your feature. If you choose `average()`, the detector finds anomalies based on the average values of your feature.
-A multi-feature model correlates anomalies across all its features. The [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality) makes it less likely for multi-feature models to identify smaller anomalies as compared to a single-feature model. Adding more features might negatively impact the [precision and recall](https://en.wikipedia.org/wiki/Precision_and_recall) of a model. A higher proportion of noise in your data might further amplify this negative impact. Selecting the optimal feature set is usually an iterative process. By default, the maximum number of features for a detector is 5. You can adjust this limit with the `plugins.anomaly_detection.max_anomaly_features` setting.
-{: .note }
+A multi-feature model correlates anomalies across all its features. The [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality) makes it less likely that multi-feature models will identify smaller anomalies as compared to a single-feature model. Adding more features can negatively impact the [precision and recall](https://en.wikipedia.org/wiki/Precision_and_recall) of a model. A higher proportion of noise in your data can further amplify this negative impact. Selecting the optimal feature set is usually an iterative process. By default, the maximum number of features for a detector is `5`. You can adjust this limit using the `plugins.anomaly_detection.max_anomaly_features` setting.
+{: .note}
+
+### Configuring a model based on an aggregation method
To configure an anomaly detection model based on an aggregation method, follow these steps:
-1. On the **Configure Model** page, enter the **Feature name** and check **Enable feature**.
-1. For **Find anomalies based on**, select **Field Value**.
-1. For **aggregation method**, select either **average()**, **count()**, **sum()**, **min()**, or **max()**.
-1. For **Field**, select from the available options.
+1. On the **Detectors** page, select the desired detector from the list.
+2. On the detector's details page, select the **Actions** button to activate the dropdown menu and then select **Edit model configuration**.
+3. On the **Edit model configuration** page, select the **Add another feature** button.
+4. Enter a name in the **Feature name** field and select the **Enable feature** checkbox.
+5. Select **Field value** from the dropdown menu under **Find anomalies based on**.
+6. Select the desired aggregation from the dropdown menu under **Aggregation method**.
+7. Select the desired field from the options listed in the dropdown menu under **Field**.
+8. Select the **Save changes** button.
+
+### Configuring a model based on a JSON aggregation query
To configure an anomaly detection model based on a JSON aggregation query, follow these steps:
-1. On the **Configure Model** page, enter the **Feature name** and check **Enable feature**.
-1. For **Find anomalies based on**, select **Custom expression**. You will see the JSON editor window open up.
-1. Enter your JSON aggregation query in the editor.
-For acceptable JSON query syntax, see [OpenSearch Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index/)
-{: .note }
+1. On the **Edit model configuration** page, select the **Add another feature** button.
+2. Enter a name in the **Feature name** field and select the **Enable feature** checkbox.
+3. Select **Custom expression** from the dropdown menu under **Find anomalies based on**. The JSON editor window will open.
+4. Enter your JSON aggregation query in the editor.
+5. Select the **Save changes** button.
-#### (Optional) Set category fields for high cardinality
+For acceptable JSON query syntax, see [OpenSearch Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index/).
+{: .note}
-You can categorize anomalies based on a keyword or IP field type.
+### Setting categorical fields for high cardinality
-The category field categorizes or slices the source time series with a dimension like IP addresses, product IDs, country codes, and so on. This helps to see a granular view of anomalies within each entity of the category field to isolate and debug issues.
+You can categorize anomalies based on a keyword or IP field type. You can enable the **Categorical fields** option to categorize, or "slice," the source time series using a dimension, such as an IP address, a product ID, or a country code. This gives you a granular view of anomalies within each entity of the category field to help isolate and debug issues.
-To set a category field, choose **Enable a category field** and select a field. You can’t change the category fields after you create the detector.
+To set a category field, choose **Enable categorical fields** and select a field. You cannot change the category fields after you create the detector.
Only a certain number of unique entities are supported in the category field. Use the following equation to calculate the recommended total number of entities supported in a cluster:
@@ -147,7 +166,7 @@ Only a certain number of unique entities are supported in the category field. Us
(data nodes * heap size * anomaly detection maximum memory percentage) / (entity model size of a detector)
```
-To get the entity model size of a detector, use the [profile detector API]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/api/#profile-detector). You can adjust the maximum memory percentage with the `plugins.anomaly_detection.model_max_size_percent` setting.
+To get the detector's entity model size, use the [Profile Detector API]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/api/#profile-detector). You can adjust the maximum memory percentage using the `plugins.anomaly_detection.model_max_size_percent` setting.
Consider a cluster with 3 data nodes, each with 8 GB of JVM heap size and the default 10% memory allocation. With an entity model size of 1 MB, the following formula calculates the estimated number of unique entities:
@@ -155,81 +174,109 @@ Consider a cluster with 3 data nodes, each with 8 GB of JVM heap size and the de
(8096 MB * 0.1 / 1 MB ) * 3 = 2429
```
-If the actual total number of unique entities is higher than the number that you calculate (in this case, 2,429), the anomaly detector will attempt to model the extra entities. The detector prioritizes entities that occur more often and are more recent.
+If the actual total number of unique entities is higher than the number that you calculate (in this case, 2,429), then the anomaly detector attempts to model the extra entities. The detector prioritizes entities that occur more frequently and are more recent.
-This formula serves as a starting point. Make sure to test it with a representative workload. You can find more information in the [Improving Anomaly Detection: One million entities in one minute](https://opensearch.org/blog/one-million-enitities-in-one-minute/) blog post.
+This formula serves as a starting point. Make sure to test it with a representative workload. See the OpenSearch blog post [Improving Anomaly Detection: One million entities in one minute](https://opensearch.org/blog/one-million-enitities-in-one-minute/) for more information.
{: .note }
-#### (Advanced settings) Set a shingle size
+### Setting a shingle size
-Set the number of aggregation intervals from your data stream to consider in a detection window. It’s best to choose this value based on your actual data to see which one leads to the best results for your use case.
+In the **Advanced settings** pane, you can set the number of data stream aggregation intervals to include in the detection window. Choose this value based on your actual data to find the optimal setting for your use case. To set the shingle size, select **Show** in the **Advanced settings** pane. Enter the desired size in the **intervals** field.
-The anomaly detector expects the shingle size to be in the range of 1 and 60. The default shingle size is 8. We recommend that you don't choose 1 unless you have two or more features. Smaller values might increase [recall](https://en.wikipedia.org/wiki/Precision_and_recall) but also false positives. Larger values might be useful for ignoring noise in a signal.
+The anomaly detector requires the shingle size to be between 1 and 128. The default is `8`. Use `1` only if you have at least two features. Values less than `8` may increase [recall](https://en.wikipedia.org/wiki/Precision_and_recall) but may also increase false positives. Values greater than `8` may be useful for ignoring noise in a signal.
-#### Preview sample anomalies
+### Setting an imputation option
-Preview sample anomalies and adjust the feature settings if needed.
-For sample previews, the Anomaly Detection plugin selects a small number of data samples---for example, one data point every 30 minutes---and uses interpolation to estimate the remaining data points to approximate the actual feature data. It loads this sample dataset into the detector. The detector uses this sample dataset to generate a sample preview of anomaly results.
+In the **Advanced settings** pane, you can set the imputation option. This allows you to manage missing data in your streams. The options include the following:
-Examine the sample preview and use it to fine-tune your feature configurations (for example, enable or disable features) to get more accurate results.
+- **Ignore Missing Data (Default):** The system continues without considering missing data points, keeping the existing data flow.
+- **Fill with Custom Values:** Specify a custom value for each feature to replace missing data points, allowing for targeted imputation tailored to your data.
+- **Fill with Zeros:** Replace missing values with zeros. This is ideal when the absence of data indicates a significant event, such as a drop to zero in event counts.
+- **Use Previous Values:** Fill gaps with the last observed value to maintain continuity in your time-series data. This method treats missing data as non-anomalous, carrying forward the previous trend.
-1. Choose **Preview sample anomalies**.
- - If you don't see any sample anomaly result, check the detector interval and make sure you have more than 400 data points for some entities during the preview date range.
-1. Choose **Next**.
+Using these options can improve recall in anomaly detection. For instance, if you are monitoring for drops in event counts, including both partial and complete drops, then filling missing values with zeros helps detect significant data absences, improving detection recall.
+
+Be cautious when imputing extensively missing data, as excessive gaps can compromise model accuracy. Quality input is critical---poor data quality leads to poor model performance. The confidence score also decreases when imputations occur. You can check whether a feature value has been imputed using the `feature_imputed` field in the anomaly results index. See [Anomaly result mapping]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/result-mapping/) for more information.
+{: .note}
+
+### Suppressing anomalies with threshold-based rules
+
+In the **Advanced settings** pane, you can suppress anomalies by setting rules that define acceptable differences between the expected and actual values, either as an absolute value or a relative percentage. This helps reduce false anomalies caused by minor fluctuations, allowing you to focus on significant deviations.
+
+Suppose you want to detect substantial changes in log volume while ignoring small variations that are not meaningful. Without customized settings, the system might generate false alerts for minor changes, making it difficult to identify true anomalies. By setting suppression rules, you can ignore minor deviations and focus on real anomalous patterns.
+
+To suppress anomalies for deviations of less than 30% from the expected value, you can set the following rules:
-## Step 3: Set up detector jobs
+```
+Ignore anomalies for feature logVolume when the actual value is no more than 30% above the expected value.
+Ignore anomalies for feature logVolume when the actual value is no more than 30% below the expected value.
+```
+
+Ensure that a feature, for example, `logVolume`, is properly defined in your model. Suppression rules are tied to specific features.
+{: .note}
+
+If you expect that the log volume should differ by at least 10,000 from the expected value before being considered an anomaly, you can set absolute thresholds:
-To start a real-time detector to find anomalies in your data in near real-time, check **Start real-time detector automatically (recommended)**.
+```
+Ignore anomalies for feature logVolume when the actual value is no more than 10000 above the expected value.
+Ignore anomalies for feature logVolume when the actual value is no more than 10000 below the expected value.
+```
-Alternatively, if you want to perform historical analysis and find patterns in long historical data windows (weeks or months), check **Run historical analysis detection** and select a date range (at least 128 detection intervals).
+If no custom suppression rules are set, then the system defaults to a filter that ignores anomalies with deviations of less than 20% from the expected value for each enabled feature.
-Analyzing historical data helps you get familiar with the Anomaly Detection plugin. You can also evaluate the performance of a detector with historical data to further fine-tune it.
+### Previewing sample anomalies
-We recommend experimenting with historical analysis with different feature sets and checking the precision before moving on to real-time detectors.
+You can preview anomalies based on sample feature input and adjust the feature settings as needed. The Anomaly Detection plugin selects a small number of data samples---for example, 1 data point every 30 minutes---and uses interpolation to estimate the remaining data points to approximate the actual feature data. The sample dataset is loaded into the detector, which then uses the sample dataset to generate a preview of the anomalies.
+
+1. Choose **Preview sample anomalies**.
+ - If sample anomaly results are not displayed, check the detector interval and verify that there are 400 or more data points for the entities during the preview date range.
+2. Select the **Next** button.
-## Step 4: Review and create
+## Step 3: Setting up detector jobs
-Review your detector settings and model configurations to make sure that they're valid and then select **Create detector**.
+To start a detector to find anomalies in your data in near real time, select **Start real-time detector automatically (recommended)**.
-![Anomaly detection results]({{site.url}}{{site.baseurl}}/images/review_ad.png)
+Alternatively, if you want to perform historical analysis and find patterns in longer historical data windows (weeks or months), select the **Run historical analysis detection** box and select a date range of at least 128 detection intervals.
-If you see any validation errors, edit the settings to fix the errors and then return back to this page.
+Analyzing historical data can help to familiarize you with the Anomaly Detection plugin. For example, you can evaluate the performance of a detector against historical data in order to fine-tune it.
+
+You can experiment with historical analysis by using different feature sets and checking the precision before using real-time detectors.
+
+## Step 4: Reviewing detector settings
+
+Review your detector settings and model configurations to confirm that they are valid and then select **Create detector**.
+
+If a validation error occurs, edit the settings to correct the error and return to the detector page.
{: .note }
-## Step 5: Observe the results
+## Step 5: Observing the results
-Choose the **Real-time results** or **Historical analysis** tab. For real-time results, you need to wait for some time to see the anomaly results. If the detector interval is 10 minutes, the detector might take more than an hour to start, because its waiting for sufficient data to generate anomalies.
+Choose either the **Real-time results** or **Historical analysis** tab. For real-time results, it will take some time to display the anomaly results. For example, if the detector interval is 10 minutes, then the detector may take an hour to initiate because it is waiting for sufficient data to be able to generate anomalies.
-A shorter interval means the model passes the shingle process more quickly and starts to generate the anomaly results sooner.
-Use the [profile detector]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/api#profile-detector) operation to make sure you have sufficient data points.
+A shorter interval results in the model passing the shingle process more quickly and generating anomaly results sooner. You can use the [profile detector]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/api#profile-detector) operation to ensure that you have enough data points.
-If you see the detector pending in "initialization" for longer than a day, aggregate your existing data using the detector interval to check for any missing data points. If you find a lot of missing data points from the aggregated data, consider increasing the detector interval.
+If the detector is pending in "initialization" for longer than 1 day, aggregate your existing data using the detector interval to check for any missing data points. If you find many missing data points, consider increasing the detector interval.
-Choose and drag over the anomaly line chart to zoom in and see a more detailed view of an anomaly.
+Click and drag over the anomaly line chart to zoom in and see a detailed view of an anomaly.
{: .note }
-Analyze anomalies with the following visualizations:
+You can analyze anomalies using the following visualizations:
-- **Live anomalies** (for real-time results) displays live anomaly results for the last 60 intervals. For example, if the interval is 10, it shows results for the last 600 minutes. The chart refreshes every 30 seconds.
-- **Anomaly overview** (for real-time results) / **Anomaly history** (for historical analysis in the **Historical analysis** tab) plots the anomaly grade with the corresponding measure of confidence. This pane includes:
+- **Live anomalies** (for real-time results) displays live anomaly results for the last 60 intervals. For example, if the interval is `10`, it shows results for the last 600 minutes. The chart refreshes every 30 seconds.
+- **Anomaly overview** (for real-time results) or **Anomaly history** (for historical analysis on the **Historical analysis** tab) plots the anomaly grade with the corresponding measure of confidence. The pane includes:
- The number of anomaly occurrences based on the given data-time range.
- - The **Average anomaly grade**, a number between 0 and 1 that indicates how anomalous a data point is. An anomaly grade of 0 represents “not an anomaly,” and a non-zero value represents the relative severity of the anomaly.
+ - The **Average anomaly grade**, a number between 0 and 1 that indicates how anomalous a data point is. An anomaly grade of `0` represents "not an anomaly," and a non-zero value represents the relative severity of the anomaly.
- **Confidence** estimate of the probability that the reported anomaly grade matches the expected anomaly grade. Confidence increases as the model observes more data and learns the data behavior and trends. Note that confidence is distinct from model accuracy.
- **Last anomaly occurrence** is the time at which the last anomaly occurred.
-Underneath **Anomaly overview**/**Anomaly history** are:
+Underneath **Anomaly overview** or **Anomaly history** are:
- **Feature breakdown** plots the features based on the aggregation method. You can vary the date-time range of the detector. Selecting a point on the feature line chart shows the **Feature output**, the number of times a field appears in your index, and the **Expected value**, a predicted value for the feature output. Where there is no anomaly, the output and expected values are equal.
- ![Anomaly detection results]({{site.url}}{{site.baseurl}}/images/feature-contribution-ad.png)
-
- **Anomaly occurrences** shows the `Start time`, `End time`, `Data confidence`, and `Anomaly grade` for each detected anomaly.
Selecting a point on the anomaly line chart shows **Feature Contribution**, the percentage of a feature that contributes to the anomaly
-![Anomaly detection results]({{site.url}}{{site.baseurl}}/images/feature-contribution-ad.png)
-
-
If you set the category field, you see an additional **Heat map** chart. The heat map correlates results for anomalous entities. This chart is empty until you select an anomalous entity. You also see the anomaly and feature line chart for the time period of the anomaly (`anomaly_grade` > 0).
@@ -249,7 +296,7 @@ To see all the configuration settings for a detector, choose the **Detector conf
1. To make any changes to the detector configuration, or fine tune the time interval to minimize any false positives, go to the **Detector configuration** section and choose **Edit**.
- You need to stop real-time and historical analysis to change its configuration. Confirm that you want to stop the detector and proceed.
-1. To enable or disable features, in the **Features** section, choose **Edit** and adjust the feature settings as needed. After you make your changes, choose **Save and start detector**.
+2. To enable or disable features, in the **Features** section, choose **Edit** and adjust the feature settings as needed. After you make your changes, choose **Save and start detector**.
## Step 8: Manage your detectors
diff --git a/_observing-your-data/ad/result-mapping.md b/_observing-your-data/ad/result-mapping.md
index 7e1482a013..967b185684 100644
--- a/_observing-your-data/ad/result-mapping.md
+++ b/_observing-your-data/ad/result-mapping.md
@@ -9,9 +9,7 @@ redirect_from:
# Anomaly result mapping
-If you enabled custom result index, the anomaly detection plugin stores the results in your own index.
-
-If the anomaly detector doesn’t detect an anomaly, the result has the following format:
+When you select the **Enable custom result index** box on the **Custom result index** pane, the Anomaly Detection plugin will save the results to an index of your choosing. When the anomaly detector does not detect an anomaly, the result format is as follows:
```json
{
@@ -61,6 +59,7 @@ If the anomaly detector doesn’t detect an anomaly, the result has the followin
"threshold": 1.2368549346675202
}
```
+{% include copy-curl.html %}
## Response body fields
@@ -80,7 +79,83 @@ Field | Description
`model_id` | A unique ID that identifies a model. If a detector is a single-stream detector (with no category field), it has only one model. If a detector is a high-cardinality detector (with one or more category fields), it might have multiple models, one for each entity.
`threshold` | One of the criteria for a detector to classify a data point as an anomaly is that its `anomaly_score` must surpass a dynamic threshold. This field records the current threshold.
-If an anomaly detector detects an anomaly, the result has the following format:
+When the imputation option is enabled, the anomaly results include a `feature_imputed` array showing which features were modified due to missing data. If no features were imputed, then this array is excluded from the results.
+
+In the following example anomaly result output, the `processing_bytes_max` feature was imputed, as shown by the `imputed: true` status:
+
+```json
+{
+ "detector_id": "kzcZ43wBgEQAbjDnhzGF",
+ "schema_version": 5,
+ "data_start_time": 1635898161367,
+ "data_end_time": 1635898221367,
+ "feature_data": [
+ {
+ "feature_id": "processing_bytes_max",
+ "feature_name": "processing bytes max",
+ "data": 2322
+ },
+ {
+ "feature_id": "processing_bytes_avg",
+ "feature_name": "processing bytes avg",
+ "data": 1718.6666666666667
+ },
+ {
+ "feature_id": "processing_bytes_min",
+ "feature_name": "processing bytes min",
+ "data": 1375
+ },
+ {
+ "feature_id": "processing_bytes_sum",
+ "feature_name": "processing bytes sum",
+ "data": 5156
+ },
+ {
+ "feature_id": "processing_time_max",
+ "feature_name": "processing time max",
+ "data": 31198
+ }
+ ],
+ "execution_start_time": 1635898231577,
+ "execution_end_time": 1635898231622,
+ "anomaly_score": 1.8124904404395776,
+ "anomaly_grade": 0,
+ "confidence": 0.9802940756605277,
+ "entity": [
+ {
+ "name": "process_name",
+ "value": "process_3"
+ }
+ ],
+ "model_id": "kzcZ43wBgEQAbjDnhzGF_entity_process_3",
+ "threshold": 1.2368549346675202,
+ "feature_imputed": [
+ {
+ "feature_id": "processing_bytes_max",
+ "imputed": true
+ },
+ {
+ "feature_id": "processing_bytes_avg",
+ "imputed": false
+ },
+ {
+ "feature_id": "processing_bytes_min",
+ "imputed": false
+ },
+ {
+ "feature_id": "processing_bytes_sum",
+ "imputed": false
+ },
+ {
+ "feature_id": "processing_time_max",
+ "imputed": false
+ }
+ ]
+}
+```
+{% include copy-curl.html %}
+
+When an anomaly is detected, the result is provided in the following format:
```json
{
@@ -179,24 +254,23 @@ If an anomaly detector detects an anomaly, the result has the following format:
"execution_start_time": 1635898427803
}
```
+{% include copy-curl.html %}
-You can see the following additional fields:
+Note that the result includes the following additional fields.
Field | Description
:--- | :---
`relevant_attribution` | Represents the contribution of each input variable. The sum of the attributions is normalized to 1.
`expected_values` | The expected value for each feature.
-At times, the detector might detect an anomaly late.
-Let's say the detector sees a random mix of the triples {1, 2, 3} and {2, 4, 5} that correspond to `slow weeks` and `busy weeks`, respectively. For example 1, 2, 3, 1, 2, 3, 2, 4, 5, 1, 2, 3, 2, 4, 5, ... and so on.
-If the detector comes across a pattern {2, 2, X} and it's yet to see X, the detector infers that the pattern is anomalous, but it can't determine at this point which of the 2's is the cause. If X = 3, then the detector knows it's the first 2 in that unfinished triple, and if X = 5, then it's the second 2. If it's the first 2, then the detector detects the anomaly late.
+The detector may be late in detecting an anomaly. For example, the detector observes a sequence of data that alternates between "slow weeks" (represented by the triple {1, 2, 3}) and "busy weeks" (represented by the triple {2, 4, 5}). If the detector comes across a pattern {2, 2, X}, where it has not yet seen the value that X will take, then the detector infers that the pattern is anomalous. However, it cannot determine which 2 is the cause. If X = 3, then the first 2 is the anomaly. If X = 5, then the second 2 is the anomaly. If it is the first 2, then the detector will be late in detecting the anomaly.
-If a detector detects an anomaly late, the result has the following additional fields:
+When a detector is late in detecting an anomaly, the result includes the following additional fields.
Field | Description
:--- | :---
-`past_values` | The actual input that triggered an anomaly. If `past_values` is null, the attributions or expected values are from the current input. If `past_values` is not null, the attributions or expected values are from a past input (for example, the previous two steps of the data [1,2,3]).
-`approx_anomaly_start_time` | The approximate time of the actual input that triggers an anomaly. This field helps you understand when a detector flags an anomaly. Both single-stream and high-cardinality detectors don't query previous anomaly results because these queries are expensive operations. The cost is especially high for high-cardinality detectors that might have a lot of entities. If the data is not continuous, the accuracy of this field is low and the actual time that the detector detects an anomaly can be earlier.
+`past_values` | The actual input that triggered an anomaly. If `past_values` is `null`, then the attributions or expected values are from the current input. If `past_values` is not `null`, then the attributions or expected values are from a past input (for example, the previous two steps of the data [1,2,3]).
+`approx_anomaly_start_time` | The approximate time of the actual input that triggered an anomaly. This field helps you understand the time at which a detector flags an anomaly. Neither single-stream nor high-cardinality detectors query previous anomaly results because these queries are costly operations. The cost is especially high for high-cardinality detectors that may have many entities. If the data is not continuous, then the accuracy of this field is low and the actual time at which the detector detects an anomaly can be earlier.
```json
{
@@ -319,3 +393,4 @@ Field | Description
"approx_anomaly_start_time": 1635883620000
}
```
+{% include copy-curl.html %}
diff --git a/_observing-your-data/ad/security.md b/_observing-your-data/ad/security.md
index 8eeaa3df41..e4816cec46 100644
--- a/_observing-your-data/ad/security.md
+++ b/_observing-your-data/ad/security.md
@@ -23,6 +23,11 @@ As an admin user, you can use the Security plugin to assign specific permissions
The Security plugin has two built-in roles that cover most anomaly detection use cases: `anomaly_full_access` and `anomaly_read_access`. For descriptions of each, see [Predefined roles]({{site.url}}{{site.baseurl}}/security/access-control/users-roles#predefined-roles).
+If you use OpenSearch Dashboards to create your anomaly detectors, you may experience access issues even with `anomaly_full_access`. This issue has been resolved in OpenSearch 2.17, but for earlier versions, the following additional permissions need to be added:
+
+- `indices:data/read/search` -- You need this permission because the Anomaly Detection plugin needs to search the data source in order to validate whether there is enough data to train the model.
+- `indices:admin/mappings/fields/get` and `indices:admin/mappings/fields/get*` -- You need these permissions to validate whether the given data source has a valid timestamp field and category field (in the case of creating a high-cardinality detector).
+
If these roles don't meet your needs, mix and match individual anomaly detection [permissions]({{site.url}}{{site.baseurl}}/security/access-control/permissions/) to suit your use case. Each action corresponds to an operation in the REST API. For example, the `cluster:admin/opensearch/ad/detector/delete` permission lets you delete detectors.
### A note on alerts and fine-grained access control
@@ -31,6 +36,42 @@ When a trigger generates an alert, the detector and monitor configurations, the
To reduce the chances of unintended users viewing metadata that could describe an index, we recommend that administrators enable role-based access control and keep these kinds of design elements in mind when assigning permissions to the intended group of users. See [Limit access by backend role](#advanced-limit-access-by-backend-role) for details.
+### Selecting remote indexes with fine-grained access control
+
+To use a remote index as a data source for a detector, see the setup steps in [Authentication flow]({{site.url}}{{site.baseurl}}/search-plugins/cross-cluster-search/#authentication-flow) in [Cross-cluster search]({{site.url}}{{site.baseurl}}/search-plugins/cross-cluster-search/). You must use a role that exists in both the remote and local clusters. The remote cluster must map the chosen role to the same username as in the local cluster.
+
+---
+
+#### Example: Create a new user on the local cluster
+
+1. Create a new user on the local cluster to use for detector creation:
+
+```
+curl -XPUT -k -u 'admin:' 'https://localhost:9200/_plugins/_security/api/internalusers/anomalyuser' -H 'Content-Type: application/json' -d '{"password":"password"}'
+```
+{% include copy-curl.html %}
+
+2. Map the new user to the `anomaly_full_access` role:
+
+```
+curl -XPUT -k -u 'admin:' -H 'Content-Type: application/json' 'https://localhost:9200/_plugins/_security/api/rolesmapping/anomaly_full_access' -d '{"users" : ["anomalyuser"]}'
+```
+{% include copy-curl.html %}
+
+3. On the remote cluster, create the same user and map `anomaly_full_access` to that role:
+
+```
+curl -XPUT -k -u 'admin:' 'https://localhost:9250/_plugins/_security/api/internalusers/anomalyuser' -H 'Content-Type: application/json' -d '{"password":"password"}'
+curl -XPUT -k -u 'admin:' -H 'Content-Type: application/json' 'https://localhost:9250/_plugins/_security/api/rolesmapping/anomaly_full_access' -d '{"users" : ["anomalyuser"]}'
+```
+{% include copy-curl.html %}
+
+---
+
+### Custom results index
+
+To use a custom results index, you need additional permissions not included in the default roles provided by the OpenSearch Security plugin. To add these permissions, see [Step 1: Define a detector]({{site.url}}{{site.baseurl}}/observing-your-data/ad/index/#step-1-define-a-detector) in the [Anomaly detection]({{site.url}}{{site.baseurl}}/observing-your-data/ad/index/) documentation.
+
## (Advanced) Limit access by backend role
Use backend roles to configure fine-grained access to individual detectors based on roles. For example, users of different departments in an organization can view detectors owned by their own department.
diff --git a/_observing-your-data/query-insights/grouping-top-n-queries.md b/_observing-your-data/query-insights/grouping-top-n-queries.md
new file mode 100644
index 0000000000..28cbcbb8e5
--- /dev/null
+++ b/_observing-your-data/query-insights/grouping-top-n-queries.md
@@ -0,0 +1,331 @@
+---
+layout: default
+title: Grouping top N queries
+parent: Query insights
+nav_order: 20
+---
+
+# Grouping top N queries
+**Introduced 2.17**
+{: .label .label-purple }
+
+Monitoring the [top N queries]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/top-n-queries/) can help you identify the most resource-intensive queries based on latency, CPU, and memory usage in a specified time window. However, if a single computationally expensive query is executed multiple times, it can occupy all top N query slots, potentially preventing other expensive queries from appearing in the list. To address this issue, you can group similar queries, gaining insight into various high-impact query groups.
+
+Starting with OpenSearch version 2.17, the top N queries can be grouped by `similarity`, with additional grouping options planned for future version releases.
+
+## Grouping queries by similarity
+
+Grouping queries by `similarity` organizes them based on the query structure, removing everything except the core query operations.
+
+For example, the following query:
+
+```json
+{
+ "query": {
+ "bool": {
+ "must": [
+ { "exists": { "field": "field1" } }
+ ],
+ "query_string": {
+ "query": "search query"
+ }
+ }
+ }
+}
+```
+
+Has the following corresponding query structure:
+
+```c
+bool
+ must
+ exists
+ query_string
+```
+
+When queries share the same query structure, they are grouped together, ensuring that all similar queries belong to the same group.
+
+
+## Aggregate metrics per group
+
+In addition to retrieving latency, CPU, and memory metrics for individual top N queries, you can obtain aggregate statistics for the
+top N query groups. For each query group, the response includes the following statistics:
+- The total latency, CPU usage, or memory usage (depending on the configured metric type)
+- The total query count
+
+Using these statistics, you can calculate the average latency, CPU usage, or memory usage for each query group.
+The response also includes one example query from the query group.
+
+## Configuring query grouping
+
+Before you enable query grouping, you must enable top N query monitoring for a metric type of your choice. For more information, see [Configuring top N query monitoring]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/top-n-queries/#configuring-top-n-query-monitoring).
+
+To configure grouping for top N queries, use the following steps.
+
+### Step 1: Enable top N query monitoring
+
+Ensure that top N query monitoring is enabled for at least one of the metrics: latency, CPU, or memory. For more information, see [Configuring top N query monitoring]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/top-n-queries/#configuring-top-n-query-monitoring).
+
+For example, to enable top N query monitoring by latency with the default settings, send the following request:
+
+```json
+PUT _cluster/settings
+{
+ "persistent" : {
+ "search.insights.top_queries.latency.enabled" : true
+ }
+}
+```
+{% include copy-curl.html %}
+
+### Step 2: Configure query grouping
+
+Set the desired grouping method by updating the following cluster setting:
+
+```json
+PUT _cluster/settings
+{
+ "persistent" : {
+ "search.insights.top_queries.group_by" : "similarity"
+ }
+}
+```
+{% include copy-curl.html %}
+
+The default value for the `group_by` setting is `none`, which disables grouping. As of OpenSearch 2.17, the supported values for `group_by` are `similarity` and `none`.
+
+### Step 3 (Optional): Limit the number of monitored query groups
+
+Optionally, you can limit the number of monitored query groups. Queries already included in the top N query list (the most resource-intensive queries) will not be considered in determining the limit. Essentially, the maximum applies only to other query groups, and the top N queries are tracked separately. This helps manage the tracking of query groups based on workload and query window size.
+
+To limit tracking to 100 query groups, send the following request:
+
+```json
+PUT _cluster/settings
+{
+ "persistent" : {
+ "search.insights.top_queries.max_groups_excluding_topn" : 100
+ }
+}
+```
+{% include copy-curl.html %}
+
+The default value for `max_groups_excluding_topn` is `100`, and you can set it to any value between `0` and `10000`, inclusive.
+
+## Monitoring query groups
+
+To view the top N query groups, send the following request:
+
+```json
+GET /_insights/top_queries
+```
+{% include copy-curl.html %}
+
+The response contains the top N query groups:
+
+
+
+ Response
+
+ {: .text-delta}
+
+```json
+{
+ "top_queries": [
+ {
+ "timestamp": 1725495127359,
+ "source": {
+ "query": {
+ "match_all": {
+ "boost": 1.0
+ }
+ }
+ },
+ "phase_latency_map": {
+ "expand": 0,
+ "query": 55,
+ "fetch": 3
+ },
+ "total_shards": 1,
+ "node_id": "ZbINz1KFS1OPeFmN-n5rdg",
+ "query_hashcode": "b4c4f69290df756021ca6276be5cbb75",
+ "task_resource_usages": [
+ {
+ "action": "indices:data/read/search[phase/query]",
+ "taskId": 30,
+ "parentTaskId": 29,
+ "nodeId": "ZbINz1KFS1OPeFmN-n5rdg",
+ "taskResourceUsage": {
+ "cpu_time_in_nanos": 33249000,
+ "memory_in_bytes": 2896848
+ }
+ },
+ {
+ "action": "indices:data/read/search",
+ "taskId": 29,
+ "parentTaskId": -1,
+ "nodeId": "ZbINz1KFS1OPeFmN-n5rdg",
+ "taskResourceUsage": {
+ "cpu_time_in_nanos": 3151000,
+ "memory_in_bytes": 133936
+ }
+ }
+ ],
+ "indices": [
+ "my_index"
+ ],
+ "labels": {},
+ "search_type": "query_then_fetch",
+ "measurements": {
+ "latency": {
+ "number": 160,
+ "count": 10,
+ "aggregationType": "AVERAGE"
+ }
+ }
+ },
+ {
+ "timestamp": 1725495135160,
+ "source": {
+ "query": {
+ "term": {
+ "content": {
+ "value": "first",
+ "boost": 1.0
+ }
+ }
+ }
+ },
+ "phase_latency_map": {
+ "expand": 0,
+ "query": 18,
+ "fetch": 0
+ },
+ "total_shards": 1,
+ "node_id": "ZbINz1KFS1OPeFmN-n5rdg",
+ "query_hashcode": "c3620cc3d4df30fb3f95aeb2167289a4",
+ "task_resource_usages": [
+ {
+ "action": "indices:data/read/search[phase/query]",
+ "taskId": 50,
+ "parentTaskId": 49,
+ "nodeId": "ZbINz1KFS1OPeFmN-n5rdg",
+ "taskResourceUsage": {
+ "cpu_time_in_nanos": 10188000,
+ "memory_in_bytes": 288136
+ }
+ },
+ {
+ "action": "indices:data/read/search",
+ "taskId": 49,
+ "parentTaskId": -1,
+ "nodeId": "ZbINz1KFS1OPeFmN-n5rdg",
+ "taskResourceUsage": {
+ "cpu_time_in_nanos": 262000,
+ "memory_in_bytes": 3216
+ }
+ }
+ ],
+ "indices": [
+ "my_index"
+ ],
+ "labels": {},
+ "search_type": "query_then_fetch",
+ "measurements": {
+ "latency": {
+ "number": 109,
+ "count": 7,
+ "aggregationType": "AVERAGE"
+ }
+ }
+ },
+ {
+ "timestamp": 1725495139766,
+ "source": {
+ "query": {
+ "match": {
+ "content": {
+ "query": "first",
+ "operator": "OR",
+ "prefix_length": 0,
+ "max_expansions": 50,
+ "fuzzy_transpositions": true,
+ "lenient": false,
+ "zero_terms_query": "NONE",
+ "auto_generate_synonyms_phrase_query": true,
+ "boost": 1.0
+ }
+ }
+ }
+ },
+ "phase_latency_map": {
+ "expand": 0,
+ "query": 15,
+ "fetch": 0
+ },
+ "total_shards": 1,
+ "node_id": "ZbINz1KFS1OPeFmN-n5rdg",
+ "query_hashcode": "484eaabecd13db65216b9e2ff5eee999",
+ "task_resource_usages": [
+ {
+ "action": "indices:data/read/search[phase/query]",
+ "taskId": 64,
+ "parentTaskId": 63,
+ "nodeId": "ZbINz1KFS1OPeFmN-n5rdg",
+ "taskResourceUsage": {
+ "cpu_time_in_nanos": 12161000,
+ "memory_in_bytes": 473456
+ }
+ },
+ {
+ "action": "indices:data/read/search",
+ "taskId": 63,
+ "parentTaskId": -1,
+ "nodeId": "ZbINz1KFS1OPeFmN-n5rdg",
+ "taskResourceUsage": {
+ "cpu_time_in_nanos": 293000,
+ "memory_in_bytes": 3216
+ }
+ }
+ ],
+ "indices": [
+ "my_index"
+ ],
+ "labels": {},
+ "search_type": "query_then_fetch",
+ "measurements": {
+ "latency": {
+ "number": 43,
+ "count": 3,
+ "aggregationType": "AVERAGE"
+ }
+ }
+ }
+ ]
+}
+```
+
+
+
+## Response fields
+
+The response includes the following fields.
+
+Field | Data type | Description
+:--- |:---| :---
+`top_queries` | Array | The list of top query groups.
+`top_queries.timestamp` | Integer | The execution timestamp for the first query in the query group.
+`top_queries.source` | Object | The first query in the query group.
+`top_queries.phase_latency_map` | Object | The phase latency map for the first query in the query group. The map includes the amount of time, in milliseconds, that the query spent in the `expand`, `query`, and `fetch` phases.
+`top_queries.total_shards` | Integer | The number of shards on which the first query was executed.
+`top_queries.node_id` | String | The node ID of the node that coordinated the execution of the first query in the query group.
+`top_queries.query_hashcode` | String | The hash code that uniquely identifies the query group. This is essentially the hash of the [query structure](#grouping-queries-by-similarity).
+`top_queries.task_resource_usages` | Array of objects | The resource usage breakdown for the various tasks belonging to the first query in the query group.
+`top_queries.indices` | Array | The indexes searched by the first query in the query group.
+`top_queries.labels` | Object | Used to label the top query.
+`top_queries.search_type` | String | The search request execution type (`query_then_fetch` or `dfs_query_then_fetch`). For more information, see the `search_type` parameter in the [Search API documentation]({{site.url}}{{site.baseurl}}/api-reference/search/#url-parameters).
+`top_queries.measurements` | Object | The aggregate measurements for the query group.
+`top_queries.measurements.latency` | Object | The aggregate latency measurements for the query group.
+`top_queries.measurements.latency.number` | Integer | The total latency for the query group.
+`top_queries.measurements.latency.count` | Integer | The number of queries in the query group.
+`top_queries.measurements.latency.aggregationType` | String | The aggregation type for the current entry. If grouping by similarity is enabled, then `aggregationType` is `AVERAGE`. If it is not enabled, then `aggregationType` is `NONE`.
\ No newline at end of file
diff --git a/_observing-your-data/query-insights/index.md b/_observing-your-data/query-insights/index.md
index 549371240f..ef3a65bfcd 100644
--- a/_observing-your-data/query-insights/index.md
+++ b/_observing-your-data/query-insights/index.md
@@ -7,8 +7,10 @@ has_toc: false
---
# Query insights
+**Introduced 2.12**
+{: .label .label-purple }
-To monitor and analyze the search queries within your OpenSearch clusterQuery information, you can obtain query insights. With minimal performance impact, query insights features aim to provide comprehensive insights into search query execution, enabling you to better understand search query characteristics, patterns, and system behavior during query execution stages. Query insights facilitate enhanced detection, diagnosis, and prevention of query performance issues, ultimately improving query processing performance, user experience, and overall system resilience.
+To monitor and analyze the search queries within your OpenSearch cluster, you can obtain query insights. With minimal performance impact, query insights features aim to provide comprehensive insights into search query execution, enabling you to better understand search query characteristics, patterns, and system behavior during query execution stages. Query insights facilitate enhanced detection, diagnosis, and prevention of query performance issues, ultimately improving query processing performance, user experience, and overall system resilience.
Typical use cases for query insights features include the following:
@@ -36,4 +38,5 @@ For information about installing plugins, see [Installing plugins]({{site.url}}{
You can obtain the following information using Query Insights:
- [Top n queries]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/top-n-queries/)
+- [Grouping top N queries]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/grouping-top-n-queries/)
- [Query metrics]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/query-metrics/)
diff --git a/_observing-your-data/query-insights/query-metrics.md b/_observing-your-data/query-insights/query-metrics.md
index c8caf21d65..beac8d4e18 100644
--- a/_observing-your-data/query-insights/query-metrics.md
+++ b/_observing-your-data/query-insights/query-metrics.md
@@ -2,10 +2,12 @@
layout: default
title: Query metrics
parent: Query insights
-nav_order: 20
+nav_order: 30
---
# Query metrics
+**Introduced 2.16**
+{: .label .label-purple }
Key query [metrics](#metrics), such as aggregation types, query types, latency, and resource usage per query type, are captured along the search path by using the OpenTelemetry (OTel) instrumentation framework. The telemetry data can be consumed using OTel metrics [exporters]({{site.url}}{{site.baseurl}}/observing-your-data/trace/distributed-tracing/#exporters).
diff --git a/_observing-your-data/query-insights/top-n-queries.md b/_observing-your-data/query-insights/top-n-queries.md
index f07fd2dfef..b63d670926 100644
--- a/_observing-your-data/query-insights/top-n-queries.md
+++ b/_observing-your-data/query-insights/top-n-queries.md
@@ -7,7 +7,7 @@ nav_order: 10
# Top N queries
-Monitoring the top N queries in query insights features can help you gain real-time insights into the top queries with high latency within a certain time frame (for example, the last hour).
+Monitoring the top N queries using query insights allows you to gain real-time visibility into the queries with the highest latency or resource consumption in a specified time period (for example, the last hour).
## Configuring top N query monitoring
@@ -72,14 +72,14 @@ PUT _cluster/settings
## Monitoring the top N queries
-You can use the Insights API endpoint to obtain the top N queries for all metric types:
+You can use the Insights API endpoint to retrieve the top N queries. This API returns top N `latency` results by default.
```json
GET /_insights/top_queries
```
{% include copy-curl.html %}
-Specify a metric type to filter the response:
+Specify the `type` parameter to retrieve the top N results for other metric types. The results will be sorted in descending order based on the specified metric type.
```json
GET /_insights/top_queries?type=latency
@@ -96,6 +96,9 @@ GET /_insights/top_queries?type=memory
```
{% include copy-curl.html %}
+If your query returns no results, ensure that top N query monitoring is enabled for the target metric type and that search requests were made within the current [time window](#configuring-the-window-size).
+{: .important}
+
## Exporting top N query data
You can configure your desired exporter to export top N query data to different sinks, allowing for better monitoring and analysis of your OpenSearch queries. Currently, the following exporters are supported:
diff --git a/_query-dsl/geo-and-xy/geo-bounding-box.md b/_query-dsl/geo-and-xy/geo-bounding-box.md
index 1112a4278e..66fcc224d6 100644
--- a/_query-dsl/geo-and-xy/geo-bounding-box.md
+++ b/_query-dsl/geo-and-xy/geo-bounding-box.md
@@ -173,11 +173,11 @@ GET testindex1/_search
```
{% include copy-curl.html %}
-## Request fields
+## Parameters
-Geo-bounding box queries accept the following fields.
+Geo-bounding box queries accept the following parameters.
-Field | Data type | Description
+Parameter | Data type | Description
:--- | :--- | :---
`_name` | String | The name of the filter. Optional.
`validation_method` | String | The validation method. Valid values are `IGNORE_MALFORMED` (accept geopoints with invalid coordinates), `COERCE` (try to coerce coordinates to valid values), and `STRICT` (return an error when coordinates are invalid). Default is `STRICT`.
diff --git a/_query-dsl/geo-and-xy/geodistance.md b/_query-dsl/geo-and-xy/geodistance.md
index b272cad81e..3eef58bc69 100644
--- a/_query-dsl/geo-and-xy/geodistance.md
+++ b/_query-dsl/geo-and-xy/geodistance.md
@@ -103,11 +103,11 @@ The response contains the matching document:
}
```
-## Request fields
+## Parameters
-Geodistance queries accept the following fields.
+Geodistance queries accept the following parameters.
-Field | Data type | Description
+Parameter | Data type | Description
:--- | :--- | :---
`_name` | String | The name of the filter. Optional.
`distance` | String | The distance within which to match the points. This distance is the radius of a circle centered at the specified point. For supported distance units, see [Distance units]({{site.url}}{{site.baseurl}}/api-reference/common-parameters/#distance-units). Required.
diff --git a/_query-dsl/geo-and-xy/geopolygon.md b/_query-dsl/geo-and-xy/geopolygon.md
index 980a0c5a63..810e48f2b7 100644
--- a/_query-dsl/geo-and-xy/geopolygon.md
+++ b/_query-dsl/geo-and-xy/geopolygon.md
@@ -161,11 +161,11 @@ However, if you specify the vertices in the following order:
The response returns no results.
-## Request fields
+## Parameters
-Geopolygon queries accept the following fields.
+Geopolygon queries accept the following parameters.
-Field | Data type | Description
+Parameter | Data type | Description
:--- | :--- | :---
`_name` | String | The name of the filter. Optional.
`validation_method` | String | The validation method. Valid values are `IGNORE_MALFORMED` (accept geopoints with invalid coordinates), `COERCE` (try to coerce coordinates to valid values), and `STRICT` (return an error when coordinates are invalid). Optional. Default is `STRICT`.
diff --git a/_query-dsl/geo-and-xy/geoshape.md b/_query-dsl/geo-and-xy/geoshape.md
index 42948666f4..5b144b06d6 100644
--- a/_query-dsl/geo-and-xy/geoshape.md
+++ b/_query-dsl/geo-and-xy/geoshape.md
@@ -25,15 +25,15 @@ Relation | Description | Supporting geographic field type
## Defining the shape in a geoshape query
-You can define the shape to filter documents in a geoshape query either by providing a new shape definition at query time or by referencing the name of a shape pre-indexed in another index.
+You can define the shape to filter documents in a geoshape query either by [providing a new shape definition at query time](#using-a-new-shape-definition) or by [referencing the name of a shape pre-indexed in another index](#using-a-pre-indexed-shape-definition).
-### Using a new shape definition
+## Using a new shape definition
To provide a new shape to a geoshape query, define it in the `geo_shape` field. You must define the geoshape in [GeoJSON format](https://geojson.org/).
The following example illustrates searching for documents containing geoshapes that match a geoshape defined at query time.
-#### Step 1: Create an index
+### Step 1: Create an index
First, create an index and map the `location` field as a `geo_shape`:
@@ -422,7 +422,7 @@ GET /testindex/_search
Geoshape queries whose geometry collection contains a linestring or a multilinestring do not support the `WITHIN` relation.
{: .note}
-### Using a pre-indexed shape definition
+## Using a pre-indexed shape definition
When constructing a geoshape query, you can also reference the name of a shape pre-indexed in another index. Using this method, you can define a geoshape at index time and refer to it by name at search time.
@@ -721,10 +721,10 @@ The response returns document 1:
Note that when you indexed the geopoints, you specified their coordinates in `"latitude, longitude"` format. When you search for matching documents, the coordinate array is in `[longitude, latitude]` format. Thus, document 1 is returned in the results but document 2 is not.
-## Request fields
+## Parameters
-Geoshape queries accept the following fields.
+Geoshape queries accept the following parameters.
-Field | Data type | Description
+Parameter | Data type | Description
:--- | :--- | :---
`ignore_unmapped` | Boolean | Specifies whether to ignore an unmapped field. If set to `true`, then the query does not return any documents that contain an unmapped field. If set to `false`, then an exception is thrown when the field is unmapped. Optional. Default is `false`.
\ No newline at end of file
diff --git a/_query-dsl/joining/has-child.md b/_query-dsl/joining/has-child.md
new file mode 100644
index 0000000000..c7da5bf7a9
--- /dev/null
+++ b/_query-dsl/joining/has-child.md
@@ -0,0 +1,398 @@
+---
+layout: default
+title: Has child
+parent: Joining queries
+nav_order: 10
+---
+
+# Has child query
+
+The `has_child` query returns parent documents whose child documents match a specific query. You can establish parent/child relationships between documents in the same index by using a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field type.
+
+The `has_child` query is slower than other queries because of the join operation it performs. Performance decreases as the number of matching child documents pointing to different parent documents increases. Each `has_child` query in your search may significantly impact query performance. If you prioritize speed, avoid using this query or limit its usage as much as possible.
+{: .warning}
+
+## Example
+
+Before you can run a `has_child` query, your index must contain a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field in order to establish parent/child relationships. The index mapping request uses the following format:
+
+```json
+PUT /example_index
+{
+ "mappings": {
+ "properties": {
+ "relationship_field": {
+ "type": "join",
+ "relations": {
+ "parent_doc": "child_doc"
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+In this example, you'll configure an index that contains documents representing products and their brands.
+
+First, create the index and establish the parent/child relationship between `brand` and `product`:
+
+```json
+PUT testindex1
+{
+ "mappings": {
+ "properties": {
+ "product_to_brand": {
+ "type": "join",
+ "relations": {
+ "brand": "product"
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+Index two parent (brand) documents:
+
+```json
+PUT testindex1/_doc/1
+{
+ "name": "Luxury brand",
+ "product_to_brand" : "brand"
+}
+```
+{% include copy-curl.html %}
+
+```json
+PUT testindex1/_doc/2
+{
+ "name": "Economy brand",
+ "product_to_brand" : "brand"
+}
+```
+{% include copy-curl.html %}
+
+Index three child (product) documents:
+
+```json
+PUT testindex1/_doc/3?routing=1
+{
+ "name": "Mechanical watch",
+ "sales_count": 150,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "1"
+ }
+}
+```
+{% include copy-curl.html %}
+
+```json
+PUT testindex1/_doc/4?routing=2
+{
+ "name": "Electronic watch",
+ "sales_count": 300,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+}
+```
+{% include copy-curl.html %}
+
+```json
+PUT testindex1/_doc/5?routing=2
+{
+ "name": "Digital watch",
+ "sales_count": 100,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+}
+```
+{% include copy-curl.html %}
+
+To search for the parent of a child, use a `has_child` query. The following query returns parent documents (brands) that make watches:
+
+```json
+GET testindex1/_search
+{
+ "query" : {
+ "has_child": {
+ "type":"product",
+ "query": {
+ "match" : {
+ "name": "watch"
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response returns both brands:
+
+```json
+{
+ "took": 15,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 2,
+ "relation": "eq"
+ },
+ "max_score": 1,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "1",
+ "_score": 1,
+ "_source": {
+ "name": "Luxury brand",
+ "product_to_brand": "brand"
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "2",
+ "_score": 1,
+ "_source": {
+ "name": "Economy brand",
+ "product_to_brand": "brand"
+ }
+ }
+ ]
+ }
+}
+```
+
+## Retrieving inner hits
+
+To return child documents that matched the query, provide the `inner_hits` parameter:
+
+```json
+GET testindex1/_search
+{
+ "query" : {
+ "has_child": {
+ "type":"product",
+ "query": {
+ "match" : {
+ "name": "watch"
+ }
+ },
+ "inner_hits": {}
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response contains child documents in the `inner_hits` field:
+
+```json
+{
+ "took": 52,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 2,
+ "relation": "eq"
+ },
+ "max_score": 1,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "1",
+ "_score": 1,
+ "_source": {
+ "name": "Luxury brand",
+ "product_to_brand": "brand"
+ },
+ "inner_hits": {
+ "product": {
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 0.53899646,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "3",
+ "_score": 0.53899646,
+ "_routing": "1",
+ "_source": {
+ "name": "Mechanical watch",
+ "sales_count": 150,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "1"
+ }
+ }
+ }
+ ]
+ }
+ }
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "2",
+ "_score": 1,
+ "_source": {
+ "name": "Economy brand",
+ "product_to_brand": "brand"
+ },
+ "inner_hits": {
+ "product": {
+ "hits": {
+ "total": {
+ "value": 2,
+ "relation": "eq"
+ },
+ "max_score": 0.53899646,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "4",
+ "_score": 0.53899646,
+ "_routing": "2",
+ "_source": {
+ "name": "Electronic watch",
+ "sales_count": 300,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "5",
+ "_score": 0.53899646,
+ "_routing": "2",
+ "_source": {
+ "name": "Digital watch",
+ "sales_count": 100,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ }
+ }
+ ]
+ }
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+For more information about retrieving inner hits, see [Inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/).
+
+## Parameters
+
+The following table lists all top-level parameters supported by `has_child` queries.
+
+| Parameter | Required/Optional | Description |
+|:---|:---|:---|
+| `type` | Required | Specifies the name of the child relationship as defined in the `join` field mapping. |
+| `query` | Required | The query to run on child documents. If a child document matches the query, the parent document is returned. |
+| `ignore_unmapped` | Optional | Indicates whether to ignore unmapped `type` fields and not return documents instead of throwing an error. You can provide this parameter when querying multiple indexes, some of which may not contain the `type` field. Default is `false`. |
+| `max_children` | Optional | The maximum number of matching child documents for a parent document. If exceeded, the parent document is excluded from the search results. |
+| `min_children` | Optional | The minimum number of matching child documents required for a parent document to be included in the results. If not met, the parent is excluded. Default is `1`. |
+| `score_mode` | Optional | Defines how scores of matching child documents influence the parent document's score. Valid values are: <br> - `none`: Ignores the relevance scores of child documents and assigns a score of `0` to the parent document. <br> - `avg`: Uses the average relevance score of all matching child documents. <br> - `max`: Assigns the highest relevance score from the matching child documents to the parent. <br> - `min`: Assigns the lowest relevance score from the matching child documents to the parent. <br> - `sum`: Sums the relevance scores of all matching child documents. <br> Default is `none`. |
+| `inner_hits` | Optional | If provided, returns the underlying hits (child documents) that matched the query. |
+
+
+## Sorting limitations
+
+The `has_child` query does not support [sorting results]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/sort/) using standard sorting options. If you need to sort parent documents by fields in their child documents, you can use a [`function_score` query]({{site.url}}{{site.baseurl}}/query-dsl/compound/function-score/) and sort by the parent document's score.
+
+In the preceding example, you can sort parent documents (brands) based on the `sales_count` of their child products. This query multiplies the score by the `sales_count` field of the child documents and assigns the highest relevance score from the matching child documents to the parent:
+
+```json
+GET testindex1/_search
+{
+ "query": {
+ "has_child": {
+ "type": "product",
+ "query": {
+ "function_score": {
+ "script_score": {
+ "script": "_score * doc['sales_count'].value"
+ }
+ }
+ },
+ "score_mode": "max"
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response contains the brands sorted by the highest child `sales_count`:
+
+```json
+{
+ "took": 6,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 2,
+ "relation": "eq"
+ },
+ "max_score": 300,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "2",
+ "_score": 300,
+ "_source": {
+ "name": "Economy brand",
+ "product_to_brand": "brand"
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "1",
+ "_score": 150,
+ "_source": {
+ "name": "Luxury brand",
+ "product_to_brand": "brand"
+ }
+ }
+ ]
+ }
+}
+```
+
+## Next steps
+
+- Learn more about [retrieving inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/).
\ No newline at end of file
diff --git a/_query-dsl/joining/has-parent.md b/_query-dsl/joining/has-parent.md
new file mode 100644
index 0000000000..6b293ffff2
--- /dev/null
+++ b/_query-dsl/joining/has-parent.md
@@ -0,0 +1,358 @@
+---
+layout: default
+title: Has parent
+parent: Joining queries
+nav_order: 20
+---
+
+# Has parent query
+
+The `has_parent` query returns child documents whose parent documents match a specific query. You can establish parent/child relationships between documents in the same index by using a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field type.
+
+The `has_parent` query is slower than other queries because of the join operation it performs. Performance decreases as the number of matching parent documents increases. Each `has_parent` query in your search may significantly impact query performance. If you prioritize speed, avoid using this query or limit its usage as much as possible.
+{: .warning}
+
+## Example
+
+Before you can run a `has_parent` query, your index must contain a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field in order to establish parent/child relationships. The index mapping request uses the following format:
+
+```json
+PUT /example_index
+{
+ "mappings": {
+ "properties": {
+ "relationship_field": {
+ "type": "join",
+ "relations": {
+ "parent_doc": "child_doc"
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+For this example, first configure an index that contains documents representing products and their brands as described in the [`has_child` query example]({{site.url}}{{site.baseurl}}/query-dsl/joining/has-child/).
+
+To search for the child of a parent, use a `has_parent` query. The following query returns child documents (products) made by the brand matching the query `economy`:
+
+```json
+GET testindex1/_search
+{
+ "query" : {
+ "has_parent": {
+ "parent_type":"brand",
+ "query": {
+ "match" : {
+ "name": "economy"
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response returns all products made by the brand:
+
+```json
+{
+ "took": 11,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 2,
+ "relation": "eq"
+ },
+ "max_score": 1,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "4",
+ "_score": 1,
+ "_routing": "2",
+ "_source": {
+ "name": "Electronic watch",
+ "sales_count": 300,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "5",
+ "_score": 1,
+ "_routing": "2",
+ "_source": {
+ "name": "Digital watch",
+ "sales_count": 100,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+## Retrieving inner hits
+
+To return parent documents that matched the query, provide the `inner_hits` parameter:
+
+```json
+GET testindex1/_search
+{
+ "query" : {
+ "has_parent": {
+ "parent_type":"brand",
+ "query": {
+ "match" : {
+ "name": "economy"
+ }
+ },
+ "inner_hits": {}
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response contains parent documents in the `inner_hits` field:
+
+```json
+{
+ "took": 11,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 2,
+ "relation": "eq"
+ },
+ "max_score": 1,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "4",
+ "_score": 1,
+ "_routing": "2",
+ "_source": {
+ "name": "Electronic watch",
+ "sales_count": 300,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ },
+ "inner_hits": {
+ "brand": {
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 1.3862942,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "2",
+ "_score": 1.3862942,
+ "_source": {
+ "name": "Economy brand",
+ "product_to_brand": "brand"
+ }
+ }
+ ]
+ }
+ }
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "5",
+ "_score": 1,
+ "_routing": "2",
+ "_source": {
+ "name": "Digital watch",
+ "sales_count": 100,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ },
+ "inner_hits": {
+ "brand": {
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 1.3862942,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "2",
+ "_score": 1.3862942,
+ "_source": {
+ "name": "Economy brand",
+ "product_to_brand": "brand"
+ }
+ }
+ ]
+ }
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+For more information about retrieving inner hits, see [Inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/).
+
+## Parameters
+
+The following table lists all top-level parameters supported by `has_parent` queries.
+
+| Parameter | Required/Optional | Description |
+|:---|:---|:---|
+| `parent_type` | Required | Specifies the name of the parent relationship as defined in the `join` field mapping. |
+| `query` | Required | The query to run on parent documents. If a parent document matches the query, the child document is returned. |
+| `ignore_unmapped` | Optional | Indicates whether to ignore unmapped `parent_type` fields and not return documents instead of throwing an error. You can provide this parameter when querying multiple indexes, some of which may not contain the `parent_type` field. Default is `false`. |
+| `score` | Optional | Indicates whether the relevance score of a matching parent document is aggregated into its child documents. If `false`, then the relevance score of the parent document is ignored, and each child document is assigned a relevance score equal to the query's `boost`, which defaults to `1`. If `true`, then the relevance score of the matching parent document is aggregated into the relevance scores of its child documents. Default is `false`. |
+| `inner_hits` | Optional | If provided, returns the underlying hits (parent documents) that matched the query. |
+
+
+## Sorting limitations
+
+The `has_parent` query does not support [sorting results]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/sort/) using standard sorting options. If you need to sort child documents by fields in their parent documents, you can use a [`function_score` query]({{site.url}}{{site.baseurl}}/query-dsl/compound/function-score/) and sort by the child document's score.
+
+For the preceding example, first add a `customer_satisfaction` field by which you'll sort the child documents belonging to the parent (brand) documents:
+
+```json
+PUT testindex1/_doc/1
+{
+ "name": "Luxury watch brand",
+ "product_to_brand" : "brand",
+ "customer_satisfaction": 4.5
+}
+```
+{% include copy-curl.html %}
+
+```json
+PUT testindex1/_doc/2
+{
+ "name": "Economy watch brand",
+ "product_to_brand" : "brand",
+ "customer_satisfaction": 3.9
+}
+```
+{% include copy-curl.html %}
+
+Now you can sort child documents (products) based on the `customer_satisfaction` field of their parent brands. This query multiplies the score by the `customer_satisfaction` field of the parent documents:
+
+```json
+GET testindex1/_search
+{
+ "query": {
+ "has_parent": {
+ "parent_type": "brand",
+ "score": true,
+ "query": {
+ "function_score": {
+ "script_score": {
+ "script": "_score * doc['customer_satisfaction'].value"
+ }
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response contains the products, sorted by the highest parent `customer_satisfaction`:
+
+```json
+{
+ "took": 11,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 3,
+ "relation": "eq"
+ },
+ "max_score": 4.5,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "3",
+ "_score": 4.5,
+ "_routing": "1",
+ "_source": {
+ "name": "Mechanical watch",
+ "sales_count": 150,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "1"
+ }
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "4",
+ "_score": 3.9,
+ "_routing": "2",
+ "_source": {
+ "name": "Electronic watch",
+ "sales_count": 300,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "5",
+ "_score": 3.9,
+ "_routing": "2",
+ "_source": {
+ "name": "Digital watch",
+ "sales_count": 100,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+## Next steps
+
+- Learn more about [retrieving inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/).
\ No newline at end of file
diff --git a/_query-dsl/joining/index.md b/_query-dsl/joining/index.md
index 20f48c0b16..f0a0060640 100644
--- a/_query-dsl/joining/index.md
+++ b/_query-dsl/joining/index.md
@@ -3,16 +3,22 @@ layout: default
title: Joining queries
has_children: true
nav_order: 55
+has_toc: false
+redirect_from:
+ - /query-dsl/joining/
---
# Joining queries
OpenSearch is a distributed system in which data is spread across multiple nodes. Thus, running a SQL-like JOIN operation in OpenSearch is resource intensive. As an alternative, OpenSearch provides the following queries that perform join operations and are optimized for scaling across multiple nodes:
-- `nested` queries: Act as wrappers for other queries to search [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) fields. The nested field objects are searched as though they were indexed as separate documents.
-- `has_child` queries: Search for parent documents whose child documents match the query.
-- `has_parent` queries: Search for child documents whose parent documents match the query.
-- `parent_id` queries: A [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) field type establishes a parent/child relationship between documents in the same index. `parent_id` queries search for child documents that are joined to a specific parent document.
+
+- Queries for searching [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) fields:
+ - `nested` queries: Act as wrappers for other queries to search [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) fields. The nested field objects are searched as though they were indexed as separate documents.
+- Queries for searching documents connected by a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field type, which establishes a parent/child relationship between documents in the same index:
+ - [`has_child`]({{site.url}}{{site.baseurl}}/query-dsl/joining/has-child/) queries: Search for parent documents whose child documents match the query.
+ - [`has_parent`]({{site.url}}{{site.baseurl}}/query-dsl/joining/has-parent/) queries: Search for child documents whose parent documents match the query.
+ - [`parent_id`]({{site.url}}{{site.baseurl}}/query-dsl/joining/parent-id/) queries: Search for child documents that are joined to a specific parent document.
If [`search.allow_expensive_queries`]({{site.url}}{{site.baseurl}}/query-dsl/index/#expensive-queries) is set to `false`, then joining queries are not executed.
{: .important}
\ No newline at end of file
diff --git a/_query-dsl/joining/nested.md b/_query-dsl/joining/nested.md
new file mode 100644
index 0000000000..431a40ed1a
--- /dev/null
+++ b/_query-dsl/joining/nested.md
@@ -0,0 +1,347 @@
+---
+layout: default
+title: Nested
+parent: Joining queries
+nav_order: 30
+---
+
+# Nested query
+
+The `nested` query acts as a wrapper for other queries to search [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) fields. The nested field objects are searched as though they were indexed as separate documents. If an object matches the search, the `nested` query returns the parent document at the root level.
+
+## Example
+
+Before you can run a `nested` query, your index must contain a [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) field.
+
+To configure an example index containing nested fields, send the following request:
+
+```json
+PUT /testindex
+{
+ "mappings": {
+ "properties": {
+ "patient": {
+ "type": "nested",
+ "properties": {
+ "name": {
+ "type": "text"
+ },
+ "age": {
+ "type": "integer"
+ }
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+Next, index a document into the example index:
+
+```json
+PUT /testindex/_doc/1
+{
+ "patient": {
+ "name": "John Doe",
+ "age": 56
+ }
+}
+```
+{% include copy-curl.html %}
+
+To search the nested `patient` field, wrap your query in a `nested` query and provide the `path` to the nested field:
+
+```json
+GET /testindex/_search
+{
+ "query": {
+ "nested": {
+ "path": "patient",
+ "query": {
+ "match": {
+ "patient.name": "John"
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The query returns the matching document:
+
+```json
+{
+ "took": 3,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 0.2876821,
+ "hits": [
+ {
+ "_index": "testindex",
+ "_id": "1",
+ "_score": 0.2876821,
+ "_source": {
+ "patient": {
+ "name": "John Doe",
+ "age": 56
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+## Retrieving inner hits
+
+To return inner hits that matched the query, provide the `inner_hits` parameter:
+
+```json
+GET /testindex/_search
+{
+ "query": {
+ "nested": {
+ "path": "patient",
+ "query": {
+ "match": {
+ "patient.name": "John"
+ }
+ },
+ "inner_hits": {}
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response contains the additional `inner_hits` field. The `_nested` field identifies the specific inner object from which the inner hit originated. It contains the nested hit and the offset relative to its position in the `_source`. Because of sorting and scoring, the position of the hit objects in `inner_hits` often differs from their original location in the nested object.
+
+By default, the `_source` of the hit objects within `inner_hits` is returned relative to the `_nested` field. In this example, the `_source` within `inner_hits` contains the `name` and `age` fields as opposed to the top-level `_source`, which contains the whole `patient` object:
+
+```json
+{
+ "took": 38,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 0.2876821,
+ "hits": [
+ {
+ "_index": "testindex",
+ "_id": "1",
+ "_score": 0.2876821,
+ "_source": {
+ "patient": {
+ "name": "John Doe",
+ "age": 56
+ }
+ },
+ "inner_hits": {
+ "patient": {
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 0.2876821,
+ "hits": [
+ {
+ "_index": "testindex",
+ "_id": "1",
+ "_nested": {
+ "field": "patient",
+ "offset": 0
+ },
+ "_score": 0.2876821,
+ "_source": {
+ "name": "John Doe",
+ "age": 56
+ }
+ }
+ ]
+ }
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+You can disable returning `_source` by configuring the `_source` field in the mappings. For more information, see [Source]({{site.url}}{{site.baseurl}}/field-types/metadata-fields/source/).
+{: .tip}
+
+For more information about retrieving inner hits, see [Inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/).
+
+## Multi-level nested queries
+
+You can search documents that have nested objects inside other nested objects using multi-level nested queries. In this example, you'll query multiple layers of nested fields by specifying a nested query for each level of the hierarchy.
+
+First, create an index with multi-level nested fields:
+
+```json
+PUT /patients
+{
+ "mappings": {
+ "properties": {
+ "patient": {
+ "type": "nested",
+ "properties": {
+ "name": {
+ "type": "text"
+ },
+ "contacts": {
+ "type": "nested",
+ "properties": {
+ "name": {
+ "type": "text"
+ },
+ "relationship": {
+ "type": "text"
+ },
+ "phone": {
+ "type": "keyword"
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+Next, index a document into the example index:
+
+```json
+PUT /patients/_doc/1
+{
+ "patient": {
+ "name": "John Doe",
+ "contacts": [
+ {
+ "name": "Jane Doe",
+ "relationship": "mother",
+ "phone": "5551111"
+ },
+ {
+ "name": "Joe Doe",
+ "relationship": "father",
+ "phone": "5552222"
+ }
+ ]
+ }
+}
+```
+{% include copy-curl.html %}
+
+To search the nested `patient` field, use a multi-level `nested` query. The following query searches for patients whose contact information includes a person named `Jane` with a relationship of `mother`:
+
+```json
+GET /patients/_search
+{
+ "query": {
+ "nested": {
+ "path": "patient",
+ "query": {
+ "nested": {
+ "path": "patient.contacts",
+ "query": {
+ "bool": {
+ "must": [
+ { "match": { "patient.contacts.relationship": "mother" } },
+ { "match": { "patient.contacts.name": "Jane" } }
+ ]
+ }
+ }
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The query returns the patient who has a contact entry matching these details:
+
+```json
+{
+ "took": 14,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 1.3862942,
+ "hits": [
+ {
+ "_index": "patients",
+ "_id": "1",
+ "_score": 1.3862942,
+ "_source": {
+ "patient": {
+ "name": "John Doe",
+ "contacts": [
+ {
+ "name": "Jane Doe",
+ "relationship": "mother",
+ "phone": "5551111"
+ },
+ {
+ "name": "Joe Doe",
+ "relationship": "father",
+ "phone": "5552222"
+ }
+ ]
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+## Parameters
+
+The following table lists all top-level parameters supported by `nested` queries.
+
+| Parameter | Required/Optional | Description |
+|:---|:---|:---|
+| `path` | Required | Specifies the path to the nested object that you want to search. |
+| `query` | Required | The query to run on the nested objects within the specified `path`. If a nested object matches the query, the root parent document is returned. You can search nested fields using dot notation, such as `nested_object.subfield`. Multi-level nesting is supported and automatically detected. Thus, an inner `nested` query within another nested query automatically matches the correct nesting level, instead of the root. |
+| `ignore_unmapped` | Optional | Indicates whether to ignore unmapped `path` fields and not return documents instead of throwing an error. You can provide this parameter when querying multiple indexes, some of which may not contain the `path` field. Default is `false`. |
+| `score_mode` | Optional | Defines how scores of matching inner documents influence the parent document's score. Valid values are: <br> - `avg`: Uses the average relevance score of all matching inner documents. <br> - `max`: Assigns the highest relevance score from the matching inner documents to the parent. <br> - `min`: Assigns the lowest relevance score from the matching inner documents to the parent. <br> - `sum`: Sums the relevance scores of all matching inner documents. <br> - `none`: Ignores the relevance scores of inner documents and assigns a score of `0` to the parent document. <br> Default is `avg`. |
+| `inner_hits` | Optional | If provided, returns the underlying hits that matched the query. |
+
+## Next steps
+
+- Learn more about [retrieving inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/).
\ No newline at end of file
diff --git a/_query-dsl/joining/parent-id.md b/_query-dsl/joining/parent-id.md
new file mode 100644
index 0000000000..cbf86a796e
--- /dev/null
+++ b/_query-dsl/joining/parent-id.md
@@ -0,0 +1,96 @@
+---
+layout: default
+title: Parent ID
+parent: Joining queries
+nav_order: 40
+---
+
+# Parent ID query
+
+The `parent_id` query returns child documents whose parent document has the specified ID. You can establish parent/child relationships between documents in the same index by using a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field type.
+
+## Example
+
+Before you can run a `parent_id` query, your index must contain a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field in order to establish parent/child relationships. The index mapping request uses the following format:
+
+```json
+PUT /example_index
+{
+ "mappings": {
+ "properties": {
+ "relationship_field": {
+ "type": "join",
+ "relations": {
+ "parent_doc": "child_doc"
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+For this example, first configure an index that contains documents representing products and their brands as described in the [`has_child` query example]({{site.url}}{{site.baseurl}}/query-dsl/joining/has-child/).
+
+To search for child documents of a specific parent document, use a `parent_id` query. The following query returns child documents (products) whose parent document has the ID `1`:
+
+```json
+GET testindex1/_search
+{
+ "query": {
+ "parent_id": {
+ "type": "product",
+ "id": "1"
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response returns the child product:
+
+```json
+{
+ "took": 57,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 0.87546873,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "3",
+ "_score": 0.87546873,
+ "_routing": "1",
+ "_source": {
+ "name": "Mechanical watch",
+ "sales_count": 150,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "1"
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+## Parameters
+
+The following table lists all top-level parameters supported by `parent_id` queries.
+
+| Parameter | Required/Optional | Description |
+|:---|:---|:---|
+| `type` | Required | Specifies the name of the child relationship as defined in the `join` field mapping. |
+| `id` | Required | The ID of the parent document. The query returns child documents associated with this parent document. |
+| `ignore_unmapped` | Optional | Indicates whether to ignore unmapped `type` fields and not return documents instead of throwing an error. You can provide this parameter when querying multiple indexes, some of which may not contain the `type` field. Default is `false`. |
\ No newline at end of file
diff --git a/_query-dsl/specialized/neural.md b/_query-dsl/specialized/neural.md
index 14b930cdb6..6cd534b87f 100644
--- a/_query-dsl/specialized/neural.md
+++ b/_query-dsl/specialized/neural.md
@@ -35,6 +35,8 @@ Field | Data type | Required/Optional | Description
`min_score` | Float | Optional | The minimum score threshold for the search results. Only one variable, either `k`, `min_score`, or `max_distance`, can be specified. For more information, see [k-NN radial search]({{site.url}}{{site.baseurl}}/search-plugins/knn/radial-search-knn/).
`max_distance` | Float | Optional | The maximum distance threshold for the search results. Only one variable, either `k`, `min_score`, or `max_distance`, can be specified. For more information, see [k-NN radial search]({{site.url}}{{site.baseurl}}/search-plugins/knn/radial-search-knn/).
`filter` | Object | Optional | A query that can be used to reduce the number of documents considered. For more information about filter usage, see [k-NN search with filters]({{site.url}}{{site.baseurl}}/search-plugins/knn/filter-search-knn/). **Important**: Filter can only be used with the `faiss` or `lucene` engines.
+`method_parameters` | Object | Optional | Parameters passed to the k-NN index during search. See [Additional query parameters]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#additional-query-parameters).
+`rescore` | Object | Optional | Parameters for configuring rescoring functionality for k-NN indexes built using quantization. See [Rescoring]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#rescoring-quantized-results-using-full-precision).
#### Example request
diff --git a/_query-dsl/term/terms.md b/_query-dsl/term/terms.md
index 42c74c0436..7dac6a9619 100644
--- a/_query-dsl/term/terms.md
+++ b/_query-dsl/term/terms.md
@@ -39,6 +39,7 @@ Parameter | Data type | Description
:--- | :--- | :---
`` | String | The field in which to search. A document is returned in the results only if its field value exactly matches at least one term, with the correct spacing and capitalization.
`boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0.
+`value_type` | String | Specifies the types of values used for filtering. Valid values are `default` and `bitmap`. If omitted, the value defaults to `default`.
## Terms lookup
@@ -250,3 +251,136 @@ Parameter | Data type | Description
`path` | String | The name of the field from which to fetch field values. Specify nested fields using dot path notation. Required.
`routing` | String | Custom routing value of the document from which to fetch field values. Optional. Required if a custom routing value was provided when the document was indexed.
`boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0.
+
+## Bitmap filtering
+**Introduced 2.17**
+{: .label .label-purple }
+
+The `terms` query can filter for multiple terms simultaneously. However, when the number of terms in the input filter increases to a large value (around 10,000), the resulting network and memory overhead can become significant, making the query inefficient. In such cases, consider encoding your large terms filter using a [roaring bitmap](https://github.com/RoaringBitmap/RoaringBitmap) for more efficient filtering.
+
+The following example assumes that you have two indexes: a `products` index, which contains all the products sold by a company, and a `customers` index, which stores filters representing customers who own specific products.
+
+First, create a `products` index and map `product_id` as a `keyword`:
+
+```json
+PUT /products
+{
+ "mappings": {
+ "properties": {
+ "product_id": { "type": "keyword" }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+Next, index three documents that correspond to products:
+
+```json
+PUT products/_doc/1
+{
+ "name": "Product 1",
+ "product_id" : "111"
+}
+```
+{% include copy-curl.html %}
+
+```json
+PUT products/_doc/2
+{
+ "name": "Product 2",
+ "product_id" : "222"
+}
+```
+{% include copy-curl.html %}
+
+```json
+PUT products/_doc/3
+{
+ "name": "Product 3",
+ "product_id" : "333"
+}
+```
+{% include copy-curl.html %}
+
+To store customer bitmap filters, you'll create a `customer_filter` [binary field]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/binary/) in the `customers` index. Specify `store` as `true` to store the field:
+
+```json
+PUT /customers
+{
+ "mappings": {
+ "properties": {
+ "customer_filter": {
+ "type": "binary",
+ "store": true
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+For each customer, you need to generate a bitmap that represents the product IDs of the products the customer owns. This bitmap effectively encodes the filter criteria for that customer. In this example, you'll create a `terms` filter for a customer whose ID is `customer123` and who owns products `111`, `222`, and `333`.
+
+To encode a `terms` filter for the customer, first create a roaring bitmap for the filter. This example creates a bitmap using the [PyRoaringBitMap](https://github.com/Ezibenroc/PyRoaringBitMap) library, so first run `pip install pyroaring` to install the library. Then serialize the bitmap and encode it using a [Base64](https://en.wikipedia.org/wiki/Base64) encoding scheme:
+
+```py
+from pyroaring import BitMap
+import base64
+
+# Create a bitmap, serialize it into a byte string, and encode into Base64
+bm = BitMap([111, 222, 333]) # product ids owned by a customer
+encoded = base64.b64encode(BitMap.serialize(bm))
+
+# Convert the Base64-encoded bytes to a string for storage or transmission
+encoded_bm_str = encoded.decode('utf-8')
+
+# Print the encoded bitmap
+print(f"Encoded Bitmap: {encoded_bm_str}")
+```
+{% include copy.html %}
+
+Next, index the customer filter into the `customers` index. The document ID for the filter is the same as the ID for the corresponding customer (in this example, `customer123`). The `customer_filter` field contains the bitmap you generated for this customer:
+
+```json
+POST customers/_doc/customer123
+{
+ "customer_filter": "OjAAAAEAAAAAAAIAEAAAAG8A3gBNAQ=="
+}
+```
+{% include copy-curl.html %}
+
+Now you can run a `terms` query on the `products` index to look up a specific customer in the `customers` index. Because you're looking up a stored field instead of `_source`, set `store` to `true`. In the `value_type` field, specify the data type of the `terms` input as `bitmap`:
+
+```json
+POST /products/_search
+{
+ "query": {
+ "terms": {
+ "product_id": {
+ "index": "customers",
+ "id": "customer123",
+ "path": "customer_filter",
+ "store": true
+ },
+ "value_type": "bitmap"
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+You can also directly pass the bitmap to the `terms` query. In this example, the `product_id` field contains the customer filter bitmap for the customer whose ID is `customer123`:
+
+```json
+POST /products/_search
+{
+ "query": {
+ "terms": {
+ "product_id": "OjAAAAEAAAAAAAIAEAAAAG8A3gBNAQ==",
+ "value_type": "bitmap"
+ }
+ }
+}
+```
+{% include copy-curl.html %}
\ No newline at end of file
diff --git a/_sass/color_schemes/odfe.scss b/_sass/color_schemes/odfe.scss
deleted file mode 100644
index f9b2ca02ba..0000000000
--- a/_sass/color_schemes/odfe.scss
+++ /dev/null
@@ -1,75 +0,0 @@
-//
-// Brand colors
-//
-
-$white: #FFFFFF;
-
-$grey-dk-300: #241F21; // Error
-$grey-dk-250: mix(white, $grey-dk-300, 12.5%);
-$grey-dk-200: mix(white, $grey-dk-300, 25%);
-$grey-dk-100: mix(white, $grey-dk-300, 50%);
-$grey-dk-000: mix(white, $grey-dk-300, 75%);
-
-$grey-lt-300: #DBDBDB; // Cloud
-$grey-lt-200: mix(white, $grey-lt-300, 25%);
-$grey-lt-100: mix(white, $grey-lt-300, 50%);
-$grey-lt-000: mix(white, $grey-lt-300, 75%);
-
-$blue-300: #00007C; // Meta
-$blue-200: mix(white, $blue-300, 25%);
-$blue-100: mix(white, $blue-300, 50%);
-$blue-000: mix(white, $blue-300, 75%);
-
-$purple-300: #9600FF; // Prpl
-$purple-200: mix(white, $purple-300, 25%);
-$purple-100: mix(white, $purple-300, 50%);
-$purple-000: mix(white, $purple-300, 75%);
-
-$green-300: #00671A; // Element
-$green-200: mix(white, $green-300, 25%);
-$green-100: mix(white, $green-300, 50%);
-$green-000: mix(white, $green-300, 75%);
-
-$yellow-300: #FFDF00; // Kan-Banana
-$yellow-200: mix(white, $yellow-300, 25%);
-$yellow-100: mix(white, $yellow-300, 50%);
-$yellow-000: mix(white, $yellow-300, 75%);
-
-$red-300: #BD145A; // Ruby
-$red-200: mix(white, $red-300, 25%);
-$red-100: mix(white, $red-300, 50%);
-$red-000: mix(white, $red-300, 75%);
-
-$blue-lt-300: #0000FF; // Cascade
-$blue-lt-200: mix(white, $blue-lt-300, 25%);
-$blue-lt-100: mix(white, $blue-lt-300, 50%);
-$blue-lt-000: mix(white, $blue-lt-300, 75%);
-
-/*
-Other, unused brand colors
-
-Float #2797F4
-Firewall #0FF006B
-Hyper Pink #F261A1
-Cluster #ED20EB
-Back End #808080
-Python #25EE5C
-Warm Node #FEA501
-*/
-
-$body-background-color: $white;
-$sidebar-color: $grey-lt-000;
-$code-background-color: $grey-lt-000;
-
-$body-text-color: $grey-dk-200;
-$body-heading-color: $grey-dk-300;
-$nav-child-link-color: $grey-dk-200;
-$link-color: mix(black, $blue-lt-300, 37.5%);
-$btn-primary-color: $purple-300;
-$base-button-color: $grey-lt-000;
-
-// $border-color: $grey-dk-200;
-// $search-result-preview-color: $grey-dk-000;
-// $search-background-color: $grey-dk-250;
-// $table-background-color: $grey-dk-250;
-// $feedback-color: darken($sidebar-color, 3%);
diff --git a/_sass/custom/custom.scss b/_sass/custom/custom.scss
index 3a9dcc5e6d..b3ee3c3775 100755
--- a/_sass/custom/custom.scss
+++ b/_sass/custom/custom.scss
@@ -1039,14 +1039,25 @@ body {
display: flex;
align-items: flex-start;
justify-content: center;
- gap: 20px;
- margin: 0 auto;
+ gap: 0;
+ border-top: 1px solid #eeebee;
+ flex-direction: column;
+ @include mq(md) {
+ flex-direction: row;
+ gap: 20px
+ }
}
.search-page--sidebar {
- flex: 1;
- max-width: 200px;
- flex: 0 0 200px;
+ max-width: 100%;
+ order: 2;
+ margin-top: 1rem;
+ color: $grey-dk-300;
+ @include mq(md) {
+ flex: 1;
+ max-width: 200px;
+ margin-top: 3rem;
+ }
}
.search-page--sidebar--category-filter--checkbox-child {
@@ -1054,52 +1065,96 @@ body {
}
.search-page--results {
- flex: 3;
display: flex;
flex-direction: column;
align-items: center;
- max-width: 60%;
+ width: 100%;
+ max-width: 100%;
+ order: 3;
+ @include mq(md) {
+ flex: 3;
+ max-width: 60%;
+ }
}
-.search-page--results--input {
- width: 100%;
+.search-page--results--wrapper {
position: relative;
+ display: flex;
+ width: 100%;
+ background-color: white;
+ margin: 0 auto 2rem;
+ max-width: 800px;
}
.search-page--results--input-box {
width: 100%;
- padding: 10px;
- margin-bottom: 20px;
- border: 1px solid #ccc;
+ padding: 10px 40px 10px 10px;
+ border: 1px solid $grey-lt-300;
border-radius: 4px;
+ color: $grey-dk-300;
}
.search-page--results--input-icon {
position: absolute;
- top: 35%;
- right: 10px;
- transform: translateY(-50%);
+ right: 12px;
+ align-self: center;
pointer-events: none;
- color: #333;
+ color: $grey-dk-000;
}
-.search-page--results--diplay {
+.search-page--results--display {
width: 100%;
position: relative;
flex-flow: column nowrap;
+ margin-top: 1rem;
+ @media (max-width: $content-width) {
+ margin-top: 0.5rem;
+ }
}
-.search-page--results--diplay--header {
+.search-page--results--display--header {
text-align: center;
- margin-bottom: 20px;
background-color: transparent;
+ color: $grey-dk-300;
+ margin-bottom: 1rem;
+ margin-top: 1.5rem;
+ padding-bottom: 1rem;
+ border-bottom: 1px solid $blue-dk-100;
+ font-size: 20px;
+ @include mq(md) {
+ font-size: 1.5rem;
+ }
}
-.search-page--results--diplay--container--item {
- margin-bottom: 1%;
+.search-page--results--display--container--item {
+ margin-bottom: 2rem;
display: block;
}
+.search-page--results--no-results {
+ padding: 1rem;
+ display: block;
+ font-size: 1rem;
+ font-weight: normal;
+}
+
+.search-page--results--display--container--item--link {
+ font-family: "Open Sans Condensed", Impact, "Franklin Gothic Bold", sans-serif;
+ font-size: 1.2rem;
+ font-weight: bold;
+ display: block;
+ text-decoration: underline;
+ text-underline-offset: 5px;
+ text-decoration-color: $grey-lt-300;
+ &:hover {
+ text-decoration-color: $blue-100;
+ }
+}
+
+.category-checkbox {
+ margin-right: 4px;
+}
+
@mixin body-text($color: #000) {
color: $color;
font-family: 'Open Sans';
diff --git a/_search-plugins/collapse-search.md b/_search-plugins/collapse-search.md
new file mode 100644
index 0000000000..ec7e57515a
--- /dev/null
+++ b/_search-plugins/collapse-search.md
@@ -0,0 +1,231 @@
+---
+layout: default
+title: Collapse search results
+nav_order: 3
+---
+
+# Collapse search results
+
+The `collapse` parameter groups search results by a particular field value. This returns only the top document within each group, which helps reduce redundancy by eliminating duplicates.
+
+The `collapse` parameter requires the field being collapsed to be of either a `keyword` or a `numeric` type.
+
+---
+
+## Collapsing search results
+
+To populate an index with data, define the index mappings and an `item` field indexed as a `keyword`. The following example request shows you how to define index mappings, populate an index, and then search it.
+
+#### Define index mappings
+
+```json
+PUT /bakery-items
+{
+ "mappings": {
+ "properties": {
+ "item": {
+ "type": "keyword"
+ },
+ "category": {
+ "type": "keyword"
+ },
+ "price": {
+ "type": "float"
+ },
+ "baked_date": {
+ "type": "date"
+ }
+ }
+ }
+}
+```
+
+#### Populate an index
+
+```json
+POST /bakery-items/_bulk
+{ "index": {} }
+{ "item": "Chocolate Cake", "category": "cakes", "price": 15, "baked_date": "2023-07-01T00:00:00Z" }
+{ "index": {} }
+{ "item": "Chocolate Cake", "category": "cakes", "price": 18, "baked_date": "2023-07-04T00:00:00Z" }
+{ "index": {} }
+{ "item": "Vanilla Cake", "category": "cakes", "price": 12, "baked_date": "2023-07-02T00:00:00Z" }
+```
+
+#### Search the index, returning all results
+
+```json
+GET /bakery-items/_search
+{
+ "query": {
+ "match": {
+ "category": "cakes"
+ }
+ },
+ "sort": ["price"]
+}
+```
+
+This query returns the uncollapsed search results, showing all documents, including both entries for "Chocolate Cake".
+
+#### Search the index and collapse the results
+
+To group search results by the `item` field and sort them by `price`, you can use the following query:
+
+**Collapsed `item` field search results**
+
+```json
+GET /bakery-items/_search
+{
+ "query": {
+ "match": {
+ "category": "cakes"
+ }
+ },
+ "collapse": {
+ "field": "item"
+ },
+ "sort": ["price"]
+}
+```
+
+**Response**
+
+```json
+{
+ "took": 3,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 4,
+ "relation": "eq"
+ },
+ "max_score": null,
+ "hits": [
+ {
+ "_index": "bakery-items",
+ "_id": "mISga5EB2HLDXHkv9kAr",
+ "_score": null,
+ "_source": {
+ "item": "Vanilla Cake",
+ "category": "cakes",
+ "price": 12,
+ "baked_date": "2023-07-02T00:00:00Z",
+ "baker": "Baker A"
+ },
+ "fields": {
+ "item": [
+ "Vanilla Cake"
+ ]
+ },
+ "sort": [
+ 12
+ ]
+ },
+ {
+ "_index": "bakery-items",
+ "_id": "loSga5EB2HLDXHkv9kAr",
+ "_score": null,
+ "_source": {
+ "item": "Chocolate Cake",
+ "category": "cakes",
+ "price": 15,
+ "baked_date": "2023-07-01T00:00:00Z",
+ "baker": "Baker A"
+ },
+ "fields": {
+ "item": [
+ "Chocolate Cake"
+ ]
+ },
+ "sort": [
+ 15
+ ]
+ }
+ ]
+ }
+}
+```
+
+The collapsed search results will show only one "Chocolate Cake" entry, demonstrating how the `collapse` parameter reduces redundancy.
+
+The `collapse` parameter affects only the top search results and does not change any aggregation results. The total number of hits shown in the response reflects all matching documents before the parameter is applied, including duplicates. However, the response doesn't indicate the exact number of unique groups formed by the operation.
+
+---
+
+## Expanding collapsed results
+
+You can expand each collapsed top hit with the `inner_hits` property.
+
+The following example request applies `inner_hits` to retrieve the lowest-priced and most recent item, for each type of cake:
+
+```json
+GET /bakery-items/_search
+{
+ "query": {
+ "match": {
+ "category": "cakes"
+ }
+ },
+ "collapse": {
+ "field": "item",
+ "inner_hits": [
+ {
+ "name": "cheapest_items",
+ "size": 1,
+ "sort": ["price"]
+ },
+ {
+ "name": "newest_items",
+ "size": 1,
+ "sort": [{ "baked_date": "desc" }]
+ }
+ ]
+ },
+ "sort": ["price"]
+}
+
+```
+
+### Multiple inner hits for each collapsed hit
+
+To obtain several groups of inner hits for each collapsed result, you can set different criteria for each group. For example, let's request the three most recent items for every bakery item:
+
+```json
+GET /bakery-items/_search
+{
+ "query": {
+ "match": {
+ "category": "cakes"
+ }
+ },
+ "collapse": {
+ "field": "item",
+ "inner_hits": [
+ {
+ "name": "cheapest_items",
+ "size": 1,
+ "sort": ["price"]
+ },
+ {
+ "name": "newest_items",
+ "size": 3,
+ "sort": [{ "baked_date": "desc" }]
+ }
+ ]
+ },
+ "sort": ["price"]
+}
+
+
+```
+This query searches for documents in the `cakes` category and groups the search results by the `item` field. For each item, it retrieves the single lowest-priced item and the top three most recent items, sorted by `baked_date` in descending order.
+
+You can expand the groups by sending an additional query for each inner hit request corresponding to each collapsed hit in the response. This can significantly slow down the process if there are too many groups or inner hit requests. The `max_concurrent_group_searches` request parameter can be used to control the maximum number of concurrent searches allowed in this phase. The default is based on the number of data nodes and the default search thread pool size.
+
diff --git a/_search-plugins/concurrent-segment-search.md b/_search-plugins/concurrent-segment-search.md
index cbbb993ac9..80614e2fff 100644
--- a/_search-plugins/concurrent-segment-search.md
+++ b/_search-plugins/concurrent-segment-search.md
@@ -22,6 +22,8 @@ Without concurrent segment search, Lucene executes a request sequentially across
## Enabling concurrent segment search at the index or cluster level
+Starting with OpenSearch version 2.17, you can use the `search.concurrent_segment_search.mode` setting to configure concurrent segment search on your cluster. The existing `search.concurrent_segment_search.enabled` setting will be deprecated in future version releases in favor of the new setting.
+
By default, concurrent segment search is disabled on the cluster. You can enable concurrent segment search at two levels:
- Cluster level
@@ -30,8 +32,37 @@ By default, concurrent segment search is disabled on the cluster. You can enable
The index-level setting takes priority over the cluster-level setting. Thus, if the cluster setting is enabled but the index setting is disabled, then concurrent segment search will be disabled for that index. Because of this, the index-level setting is not evaluated unless it is explicitly set, regardless of the default value configured for the setting. You can retrieve the current value of the index-level setting by calling the [Index Settings API]({{site.url}}{{site.baseurl}}/api-reference/index-apis/get-settings/) and omitting the `?include_defaults` query parameter.
{: .note}
-To enable concurrent segment search for all indexes in the cluster, set the following dynamic cluster setting:
+Both the cluster- and index-level `search.concurrent_segment_search.mode` settings accept the following values:
+
+- `all`: Enables concurrent segment search across all search requests. This is equivalent to setting `search.concurrent_segment_search.enabled` to `true`.
+
+- `none`: Disables concurrent segment search for all search requests, effectively turning off the feature. This is equivalent to setting `search.concurrent_segment_search.enabled` to `false`. This is the **default** behavior.
+
+- `auto`: In this mode, OpenSearch will use the pluggable _concurrent search decider_ to decide whether to use a concurrent or sequential path for the search request based on the query evaluation and the presence of aggregations in the request. By default, if there are no deciders configured by any plugin, then the decision to use concurrent search will be made based on the presence of aggregations in the request. For more information about the pluggable decider semantics, see [Pluggable concurrent search deciders](#pluggable-concurrent-search-deciders-concurrentsearchrequestdecider).
+
+To enable concurrent segment search for all search requests across every index in the cluster, send the following request:
+```json
+PUT _cluster/settings
+{
+ "persistent":{
+ "search.concurrent_segment_search.mode": "all"
+ }
+}
+```
+{% include copy-curl.html %}
+
+To enable concurrent segment search for all search requests on a particular index, specify the index name in the endpoint:
+
+```json
+PUT /_settings
+{
+ "index.search.concurrent_segment_search.mode": "all"
+}
+```
+{% include copy-curl.html %}
+
+You can continue to use the existing `search.concurrent_segment_search.enabled` setting to enable concurrent segment search for all indexes in the cluster as follows:
```json
PUT _cluster/settings
{
@@ -52,6 +83,35 @@ PUT /_settings
```
{% include copy-curl.html %}
+
+When evaluating whether concurrent segment search is enabled on a cluster, the `search.concurrent_segment_search.mode` setting takes precedence over the `search.concurrent_segment_search.enabled` setting.
+If the `search.concurrent_segment_search.mode` setting is not explicitly set, then the `search.concurrent_segment_search.enabled` setting will be evaluated to determine whether to enable concurrent segment search.
+
+When upgrading a cluster from an earlier version that specifies the older `search.concurrent_segment_search.enabled` setting, this setting will continue to be honored. However, once the `search.concurrent_segment_search.mode` is set, it will override the previous setting, enabling or disabling concurrent search based on the specified mode.
+We recommend setting `search.concurrent_segment_search.enabled` to `null` on your cluster once you configure `search.concurrent_segment_search.mode`:
+
+```json
+PUT _cluster/settings
+{
+ "persistent":{
+ "search.concurrent_segment_search.enabled": null
+ }
+}
+```
+{% include copy-curl.html %}
+
+To disable the old setting for a particular index, specify the index name in the endpoint:
+```json
+PUT /_settings
+{
+ "index.search.concurrent_segment_search.enabled": null
+}
+```
+{% include copy-curl.html %}
+
+
+
+
## Slicing mechanisms
You can choose one of two available mechanisms for assigning segments to slices: the default [Lucene mechanism](#the-lucene-mechanism) or the [max slice count mechanism](#the-max-slice-count-mechanism).
@@ -66,7 +126,10 @@ The _max slice count_ mechanism is an alternative slicing mechanism that uses a
### Setting the slicing mechanism
-By default, concurrent segment search uses the Lucene mechanism to calculate the number of slices for each shard-level request. To use the max slice count mechanism instead, configure the `search.concurrent.max_slice_count` cluster setting:
+By default, concurrent segment search uses the Lucene mechanism to calculate the number of slices for each shard-level request.
+To use the max slice count mechanism instead, you can set the slice count for concurrent segment search at either the cluster level or index level.
+
+To configure the slice count for all indexes in a cluster, use the following dynamic cluster setting:
```json
PUT _cluster/settings
@@ -78,7 +141,17 @@ PUT _cluster/settings
```
{% include copy-curl.html %}
-The `search.concurrent.max_slice_count` setting can take the following valid values:
+To configure the slice count for a particular index, specify the index name in the endpoint:
+
+```json
+PUT /_settings
+{
+ "index.search.concurrent.max_slice_count": 2
+}
+```
+{% include copy-curl.html %}
+
+Both the cluster- and index-level `search.concurrent.max_slice_count` settings can take the following valid values:
- `0`: Use the default Lucene mechanism.
- Positive integer: Use the max target slice count mechanism. Usually, a value between 2 and 8 should be sufficient.
@@ -117,8 +190,20 @@ Non-concurrent search calculates the document count error and returns it in the
For more information about how `shard_size` can affect both `doc_count_error_upper_bound` and collected buckets, see [this GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/11680#issuecomment-1885882985).
-## Developer information: AggregatorFactory changes
+## Developer information
+
+The following sections provide additional information for developers.
+
+### AggregatorFactory changes
+
+Because of implementation details, not all aggregator types can support concurrent segment search. To accommodate this, we have introduced a [`supportsConcurrentSegmentSearch()`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/search/aggregations/AggregatorFactory.java#L123) method in the `AggregatorFactory` class to indicate whether a given aggregation type supports concurrent segment search. By default, this method returns `false`. Any aggregator that needs to support concurrent segment search must override this method in its own factory implementation.
+
+To ensure that a custom plugin-based `Aggregator` implementation functions with the concurrent search path, plugin developers can verify their implementation with concurrent search enabled and then update the plugin to override the [`supportsConcurrentSegmentSearch()`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/search/aggregations/AggregatorFactory.java#L123) method to return `true`.
+
+### Pluggable concurrent search deciders: ConcurrentSearchRequestDecider
-Because of implementation details, not all aggregator types can support concurrent segment search. To accommodate this, we have introduced a [`supportsConcurrentSegmentSearch()`](https://github.com/opensearch-project/OpenSearch/blob/bb38ed4836496ac70258c2472668325a012ea3ed/server/src/main/java/org/opensearch/search/aggregations/AggregatorFactory.java#L121) method in the `AggregatorFactory` class to indicate whether a given aggregation type supports concurrent segment search. By default, this method returns `false`. Any aggregator that needs to support concurrent segment search must override this method in its own factory implementation.
+**Introduced 2.17**
+{: .label .label-purple }
-To ensure that a custom plugin-based `Aggregator` implementation works with the concurrent search path, plugin developers can verify their implementation with concurrent search enabled and then update the plugin to override the [`supportsConcurrentSegmentSearch()`](https://github.com/opensearch-project/OpenSearch/blob/bb38ed4836496ac70258c2472668325a012ea3ed/server/src/main/java/org/opensearch/search/aggregations/AggregatorFactory.java#L121) method to return `true`.
+Plugin developers can customize the concurrent search decision-making for `auto` mode by extending [`ConcurrentSearchRequestDecider`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/search/deciders/ConcurrentSearchRequestDecider.java) and registering its factory through [`SearchPlugin#getConcurrentSearchRequestFactories()`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/plugins/SearchPlugin.java#L148). The deciders are evaluated only if a request does not belong to any category listed in the [Limitations](#limitations) and [Other considerations](#other-considerations) sections. For more information about the decider implementation, see [the corresponding GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/15259).
+The search request is parsed using a `QueryBuilderVisitor`, which calls the [`ConcurrentSearchRequestDecider#evaluateForQuery()`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/search/deciders/ConcurrentSearchRequestDecider.java#L36) method of all the configured deciders for every node of the `QueryBuilder` tree in the search request. The final concurrent search decision is obtained by combining the decision from each decider returned by the [`ConcurrentSearchRequestDecider#getConcurrentSearchDecision()`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/search/deciders/ConcurrentSearchRequestDecider.java#L44) method.
\ No newline at end of file
diff --git a/_search-plugins/knn/api.md b/_search-plugins/knn/api.md
index c7314f7ae2..d927bf1c35 100644
--- a/_search-plugins/knn/api.md
+++ b/_search-plugins/knn/api.md
@@ -185,7 +185,7 @@ This API operation only works with indexes created using the `nmslib` and `faiss
The following request evicts the native library indexes of three indexes from the cache:
```json
-GET /_plugins/_knn/clear_cache/index1,index2,index3?pretty
+POST /_plugins/_knn/clear_cache/index1,index2,index3?pretty
{
"_shards" : {
"total" : 6,
@@ -200,7 +200,7 @@ The `total` parameter indicates the number of shards that the API attempted to c
The k-NN clear cache API can be used with index patterns to clear one or more indexes that match the given pattern from the cache, as shown in the following example:
```json
-GET /_plugins/_knn/clear_cache/index*?pretty
+POST /_plugins/_knn/clear_cache/index*?pretty
{
"_shards" : {
"total" : 6,
@@ -234,7 +234,7 @@ Response field | Description
`timestamp` | The date and time when the model was created.
`description` | A user-provided description of the model.
`error` | An error message explaining why the model is in a failed state.
-`space_type` | The space type for which this model is trained, for example, Euclidean or cosine.
+`space_type` | The space type for which this model is trained, for example, Euclidean or cosine. Note: This value can also be set at the top level of the request.
`dimension` | The dimensionality of the vector space for which this model is designed.
`engine` | The native library used to create the model, either `faiss` or `nmslib`.
@@ -351,6 +351,7 @@ Request parameter | Description
`search_size` | The training data is pulled from the training index using scroll queries. This parameter defines the number of results to return per scroll query. Default is `10000`. Optional.
`description` | A user-provided description of the model. Optional.
`method` | The configuration of the approximate k-NN method used for search operations. For more information about the available methods, see [k-NN index method definitions]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index#method-definitions). The method requires training to be valid.
+`space_type` | The space type for which this model is trained, for example, Euclidean or cosine. Note: This value can also be set in the `method` parameter.
#### Usage
@@ -365,10 +366,10 @@ POST /_plugins/_knn/models/{model_id}/_train?preference={node_id}
"max_training_vector_count": 1200,
"search_size": 100,
"description": "My model",
+ "space_type": "l2",
"method": {
"name":"ivf",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"nlist":128,
"encoder":{
@@ -395,10 +396,10 @@ POST /_plugins/_knn/models/_train?preference={node_id}
"max_training_vector_count": 1200,
"search_size": 100,
"description": "My model",
+ "space_type": "l2",
"method": {
"name":"ivf",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"nlist":128,
"encoder":{
diff --git a/_search-plugins/knn/approximate-knn.md b/_search-plugins/knn/approximate-knn.md
index e9cff8562f..f8921033e0 100644
--- a/_search-plugins/knn/approximate-knn.md
+++ b/_search-plugins/knn/approximate-knn.md
@@ -49,9 +49,9 @@ PUT my-knn-index-1
"my_vector1": {
"type": "knn_vector",
"dimension": 2,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "nmslib",
"parameters": {
"ef_construction": 128,
@@ -62,9 +62,9 @@ PUT my-knn-index-1
"my_vector2": {
"type": "knn_vector",
"dimension": 4,
+ "space_type": "innerproduct",
"method": {
"name": "hnsw",
- "space_type": "innerproduct",
"engine": "faiss",
"parameters": {
"ef_construction": 256,
@@ -199,10 +199,10 @@ POST /_plugins/_knn/models/my-model/_train
"training_field": "train-field",
"dimension": 4,
"description": "My model description",
+ "space_type": "l2",
"method": {
"name": "ivf",
"engine": "faiss",
- "space_type": "l2",
"parameters": {
"nlist": 4,
"nprobes": 2
@@ -308,6 +308,72 @@ Engine | Notes
:--- | :---
`faiss` | If `nprobes` is present in a query, it overrides the value provided when creating the index.
+### Rescoring quantized results using full precision
+
+Quantization can be used to significantly reduce the memory footprint of a k-NN index. For more information about quantization, see [k-NN vector quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization). Because some vector representation is lost during quantization, the computed distances will be approximate. This causes the overall recall of the search to decrease.
+
+To improve recall while maintaining the memory savings of quantization, you can use a two-phase search approach. In the first phase, `oversample_factor * k` results are retrieved from an index using quantized vectors and the scores are approximated. In the second phase, the full-precision vectors of those `oversample_factor * k` results are loaded into memory from disk, and scores are recomputed against the full-precision query vector. The results are then reduced to the top k.
+
+The default rescoring behavior is determined by the `mode` and `compression_level` of the backing k-NN vector field:
+
+- For `in_memory` mode, no rescoring is applied by default.
+- For `on_disk` mode, default rescoring is based on the configured `compression_level`. Each `compression_level` provides a default `oversample_factor`, specified in the following table.
+
+| Compression level | Default rescore `oversample_factor` |
+|:------------------|:----------------------------------|
+| `32x` (default) | 3.0 |
+| `16x` | 2.0 |
+| `8x` | 2.0 |
+| `4x` | No default rescoring |
+| `2x` | No default rescoring |
+
+To explicitly apply rescoring, provide the `rescore` parameter in a query on a quantized index and specify the `oversample_factor`:
+
+```json
+GET my-knn-index-1/_search
+{
+ "size": 2,
+ "query": {
+ "knn": {
+ "target-field": {
+ "vector": [2, 3, 5, 6],
+ "k": 2,
+ "rescore" : {
+ "oversample_factor": 1.2
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+Alternatively, set the `rescore` parameter to `true` to use a default `oversample_factor` of `1.0`:
+
+```json
+GET my-knn-index-1/_search
+{
+ "size": 2,
+ "query": {
+ "knn": {
+ "target-field": {
+ "vector": [2, 3, 5, 6],
+ "k": 2,
+ "rescore" : true
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The `oversample_factor` is a floating-point number between 1.0 and 100.0, inclusive. The number of results in the first pass is calculated as `oversample_factor * k` and is guaranteed to be between 100 and 10,000, inclusive. If the calculated number of results is smaller than 100, then the number of results is set to 100. If the calculated number of results is greater than 10,000, then the number of results is set to 10,000.
+
+Rescoring is only supported for the `faiss` engine.
+
+Rescoring is not needed if quantization is not used because the scores returned are already fully precise.
+{: .note}
+
### Using approximate k-NN with filters
To learn about using filters with k-NN search, see [k-NN search with filters]({{site.url}}{{site.baseurl}}/search-plugins/knn/filter-search-knn/).
@@ -322,7 +388,7 @@ To learn more about the radial search feature, see [k-NN radial search]({{site.u
### Using approximate k-NN with binary vectors
-To learn more about using binary vectors with k-NN search, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors).
+To learn more about using binary vectors with k-NN search, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors).
## Spaces
@@ -346,5 +412,5 @@ The cosine similarity formula does not include the `1 -` prefix. However, becaus
With cosine similarity, it is not valid to pass a zero vector (`[0, 0, ...]`) as input. This is because the magnitude of such a vector is 0, which raises a `divide by 0` exception in the corresponding formula. Requests containing the zero vector will be rejected, and a corresponding exception will be thrown.
{: .note }
-The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors).
+The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors).
{: .note}
diff --git a/_search-plugins/knn/disk-based-vector-search.md b/_search-plugins/knn/disk-based-vector-search.md
new file mode 100644
index 0000000000..dfb9262db5
--- /dev/null
+++ b/_search-plugins/knn/disk-based-vector-search.md
@@ -0,0 +1,193 @@
+---
+layout: default
+title: Disk-based vector search
+nav_order: 16
+parent: k-NN search
+has_children: false
+---
+
+# Disk-based vector search
+**Introduced 2.17**
+{: .label .label-purple}
+
+For low-memory environments, OpenSearch provides _disk-based vector search_, which significantly reduces the operational costs for vector workloads. Disk-based vector search uses [binary quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#binary-quantization), compressing vectors and thereby reducing the memory requirements. This memory optimization provides large memory savings at the cost of slightly increased search latency while still maintaining strong recall.
+
+To use disk-based vector search, set the [`mode`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#vector-workload-modes) parameter to `on_disk` for your vector field type. This parameter will configure your index to use secondary storage.
+
+## Creating an index for disk-based vector search
+
+To create an index for disk-based vector search, send the following request:
+
+```json
+PUT my-vector-index
+{
+ "mappings": {
+ "properties": {
+ "my_vector_field": {
+ "type": "knn_vector",
+ "dimension": 8,
+ "space_type": "innerproduct",
+ "data_type": "float",
+ "mode": "on_disk"
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+By default, the `on_disk` mode configures the index to use the `faiss` engine and `hnsw` method. The default [`compression_level`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#compression-levels) of `32x` reduces the amount of memory the vectors require by a factor of 32. To preserve the search recall, rescoring is enabled by default. A search on a disk-optimized index runs in two phases: The compressed index is searched first, and then the results are rescored using full-precision vectors loaded from disk.
+
+To reduce the compression level, provide the `compression_level` parameter when creating the index mapping:
+
+```json
+PUT my-vector-index
+{
+ "mappings": {
+ "properties": {
+ "my_vector_field": {
+ "type": "knn_vector",
+ "dimension": 8,
+ "space_type": "innerproduct",
+ "data_type": "float",
+ "mode": "on_disk",
+ "compression_level": "16x"
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+For more information about the `compression_level` parameter, see [Compression levels]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#compression-levels). Note that for `4x` compression, the `lucene` engine will be used.
+{: .note}
+
+If you need more granular fine-tuning, you can override additional k-NN parameters in the method definition. For example, to improve recall, increase the `ef_construction` parameter value:
+
+```json
+PUT my-vector-index
+{
+ "mappings": {
+ "properties": {
+ "my_vector_field": {
+ "type": "knn_vector",
+ "dimension": 8,
+ "space_type": "innerproduct",
+ "data_type": "float",
+ "mode": "on_disk",
+ "method": {
+ "parameters": {
+ "ef_construction": 512
+ }
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The `on_disk` mode only works with the `float` data type.
+{: .note}
+
+## Ingestion
+
+You can perform document ingestion for a disk-optimized vector index in the same way as for a regular vector index. To index several documents in bulk, send the following request:
+
+```json
+POST _bulk
+{ "index": { "_index": "my-vector-index", "_id": "1" } }
+{ "my_vector_field": [1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5], "price": 12.2 }
+{ "index": { "_index": "my-vector-index", "_id": "2" } }
+{ "my_vector_field": [2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5], "price": 7.1 }
+{ "index": { "_index": "my-vector-index", "_id": "3" } }
+{ "my_vector_field": [3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5], "price": 12.9 }
+{ "index": { "_index": "my-vector-index", "_id": "4" } }
+{ "my_vector_field": [4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5], "price": 1.2 }
+{ "index": { "_index": "my-vector-index", "_id": "5" } }
+{ "my_vector_field": [5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5], "price": 3.7 }
+{ "index": { "_index": "my-vector-index", "_id": "6" } }
+{ "my_vector_field": [6.5, 6.5, 6.5, 6.5, 6.5, 6.5, 6.5, 6.5], "price": 10.3 }
+{ "index": { "_index": "my-vector-index", "_id": "7" } }
+{ "my_vector_field": [7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5], "price": 5.5 }
+{ "index": { "_index": "my-vector-index", "_id": "8" } }
+{ "my_vector_field": [8.5, 8.5, 8.5, 8.5, 8.5, 8.5, 8.5, 8.5], "price": 4.4 }
+{ "index": { "_index": "my-vector-index", "_id": "9" } }
+{ "my_vector_field": [9.5, 9.5, 9.5, 9.5, 9.5, 9.5, 9.5, 9.5], "price": 8.9 }
+```
+{% include copy-curl.html %}
+
+## Search
+
+Search is also performed in the same way as in other index configurations. The key difference is that, by default, the `oversample_factor` of the rescore parameter is set to `3.0` (unless you override the `compression_level`). For more information, see [Rescoring quantized results using full precision]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#rescoring-quantized-results-using-full-precision). To perform vector search on a disk-optimized index, provide the search vector:
+
+```json
+GET my-vector-index/_search
+{
+ "query": {
+ "knn": {
+ "my_vector_field": {
+ "vector": [1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5],
+ "k": 5
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+Similarly to other index configurations, you can override k-NN parameters in the search request:
+
+```json
+GET my-vector-index/_search
+{
+ "query": {
+ "knn": {
+ "my_vector_field": {
+ "vector": [1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5],
+ "k": 5,
+ "method_parameters": {
+ "ef_search": 512
+ },
+ "rescore": {
+ "oversample_factor": 10.0
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+[Radial search]({{site.url}}{{site.baseurl}}/search-plugins/knn/radial-search-knn/) does not support disk-based vector search.
+{: .note}
+
+## Model-based indexes
+
+For [model-based indexes]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-k-nn-index-from-a-model), you can specify the `on_disk` parameter in the training request in the same way that you would specify it during index creation. By default, `on_disk` mode will use the [Faiss IVF method]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#supported-faiss-methods) and a compression level of `32x`. To run the training API, send the following request:
+
+```json
+POST /_plugins/_knn/models/test-model/_train
+{
+ "training_index": "train-index-name",
+ "training_field": "train-field-name",
+ "dimension": 8,
+ "max_training_vector_count": 1200,
+ "search_size": 100,
+ "description": "My model",
+ "space_type": "innerproduct",
+ "mode": "on_disk"
+}
+```
+{% include copy-curl.html %}
+
+This command assumes that training data has been ingested into the `train-index-name` index. For more information, see [Building a k-NN index from a model]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-k-nn-index-from-a-model).
+{: .note}
+
+You can override the `compression_level` for disk-optimized indexes in the same way as for regular k-NN indexes.
+
+
+## Next steps
+
+- For more information about binary quantization, see [Binary quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#binary-quantization).
+- For more information about k-NN vector workload modes, see [Vector workload modes]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#vector-workload-modes).
\ No newline at end of file
diff --git a/_search-plugins/knn/knn-index.md b/_search-plugins/knn/knn-index.md
index a6ffd922eb..620b262cf9 100644
--- a/_search-plugins/knn/knn-index.md
+++ b/_search-plugins/knn/knn-index.md
@@ -25,9 +25,9 @@ PUT /test-index
"my_vector1": {
"type": "knn_vector",
"dimension": 3,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "lucene",
"parameters": {
"ef_construction": 128,
@@ -41,17 +41,17 @@ PUT /test-index
```
{% include copy-curl.html %}
-## Lucene byte vector
+## Byte vectors
-Starting with k-NN plugin version 2.9, you can use `byte` vectors with the `lucene` engine to reduce the amount of storage space needed. For more information, see [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#lucene-byte-vector).
+Starting with k-NN plugin version 2.17, you can use `byte` vectors with the `faiss` and `lucene` engines to reduce the amount of required memory and storage space. For more information, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#byte-vectors).
-## Binary vector
+## Binary vectors
-Starting with k-NN plugin version 2.16, you can use `binary` vectors with the `faiss` engine to reduce the amount of required storage space. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors).
+Starting with k-NN plugin version 2.16, you can use `binary` vectors with the `faiss` engine to reduce the amount of required storage space. For more information, see [Binary vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors).
## SIMD optimization for the Faiss engine
-Starting with version 2.13, the k-NN plugin supports [Single Instruction Multiple Data (SIMD)](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data) processing if the underlying hardware supports SIMD instructions (AVX2 on x64 architecture and Neon on ARM64 architecture). SIMD is supported by default on Linux machines only for the Faiss engine. SIMD architecture helps boost overall performance by improving indexing throughput and reducing search latency.
+Starting with version 2.13, the k-NN plugin supports [Single Instruction Multiple Data (SIMD)](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data) processing if the underlying hardware supports SIMD instructions (AVX2 on x64 architecture and Neon on ARM64 architecture). SIMD is supported by default on Linux machines only for the Faiss engine. SIMD architecture helps boost overall performance by improving indexing throughput and reducing search latency. Starting with version 2.18, the k-NN plugin supports AVX512 SIMD instructions on x64 architecture.
SIMD optimization is applicable only if the vector dimension is a multiple of 8.
{: .note}
@@ -60,14 +60,22 @@ SIMD optimization is applicable only if the vector dimension is a multiple of 8.
### x64 architecture
-For the x64 architecture, two different versions of the Faiss library are built and shipped with the artifact:
+For x64 architecture, the following versions of the Faiss library are built and shipped with the artifact:
- `libopensearchknn_faiss.so`: The non-optimized Faiss library without SIMD instructions.
-- `libopensearchknn_faiss_avx2.so`: The Faiss library that contains AVX2 SIMD instructions.
+- `libopensearchknn_faiss_avx512.so`: The Faiss library containing AVX512 SIMD instructions.
+- `libopensearchknn_faiss_avx2.so`: The Faiss library containing AVX2 SIMD instructions.
-If your hardware supports AVX2, the k-NN plugin loads the `libopensearchknn_faiss_avx2.so` library at runtime.
+When using the Faiss library, the performance ranking is as follows: AVX512 > AVX2 > no optimization.
+{: .note }
+
+If your hardware supports AVX512, the k-NN plugin loads the `libopensearchknn_faiss_avx512.so` library at runtime.
+
+If your hardware supports AVX2 but doesn't support AVX512, the k-NN plugin loads the `libopensearchknn_faiss_avx2.so` library at runtime.
+
+To disable the AVX512 and AVX2 SIMD instructions and load the non-optimized Faiss library (`libopensearchknn_faiss.so`), specify the `knn.faiss.avx512.disabled` and `knn.faiss.avx2.disabled` static settings as `true` in `opensearch.yml` (by default, both of these are `false`).
-To disable AVX2 and load the non-optimized Faiss library (`libopensearchknn_faiss.so`), specify the `knn.faiss.avx2.disabled` static setting as `true` in `opensearch.yml` (default is `false`). Note that to update a static setting, you must stop the cluster, change the setting, and restart the cluster. For more information, see [Static settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/#static-settings).
+Note that to update a static setting, you must stop the cluster, change the setting, and restart the cluster. For more information, see [Static settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/#static-settings).
### ARM64 architecture
@@ -83,7 +91,7 @@ A method definition will always contain the name of the method, the space_type t
Mapping parameter | Required | Default | Updatable | Description
:--- | :--- | :--- | :--- | :---
`name` | true | n/a | false | The identifier for the nearest neighbor method.
-`space_type` | false | l2 | false | The vector space used to calculate the distance between vectors.
+`space_type` | false | l2 | false | The vector space used to calculate the distance between vectors. Note: This value can also be specified at the top level of the mapping.
`engine` | false | nmslib | false | The approximate k-NN library to use for indexing and search. The available libraries are faiss, nmslib, and Lucene.
`parameters` | false | null | false | The parameters used for the nearest neighbor method.
@@ -116,7 +124,7 @@ Method name | Requires training | Supported spaces | Description
For hnsw, "innerproduct" is not available when PQ is used.
{: .note}
-The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors).
+The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors).
{: .note}
#### HNSW parameters
@@ -168,7 +176,6 @@ An index created in OpenSearch version 2.11 or earlier will still use the old `e
"method": {
"name":"hnsw",
"engine":"lucene",
- "space_type": "l2",
"parameters":{
"m":2048,
"ef_construction": 245
@@ -186,7 +193,6 @@ The following example method definition specifies the `hnsw` method and a `pq` e
"method": {
"name":"hnsw",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"encoder":{
"name":"pq",
@@ -232,7 +238,6 @@ The following example uses the `ivf` method without specifying an encoder (by d
"method": {
"name":"ivf",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"nlist": 4,
"nprobes": 2
@@ -246,7 +251,6 @@ The following example uses the `ivf` method with a `pq` encoder:
"method": {
"name":"ivf",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"encoder":{
"name":"pq",
@@ -265,7 +269,6 @@ The following example uses the `hnsw` method without specifying an encoder (by d
"method": {
"name":"hnsw",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"ef_construction": 256,
"m": 8
@@ -279,7 +282,6 @@ The following example uses the `hnsw` method with an `sq` encoder of type `fp16`
"method": {
"name":"hnsw",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"encoder": {
"name": "sq",
@@ -300,7 +302,6 @@ The following example uses the `ivf` method with an `sq` encoder of type `fp16`:
"method": {
"name":"ivf",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"encoder": {
"name": "sq",
@@ -324,7 +325,7 @@ If you want to use less memory and increase indexing speed as compared to HNSW w
If memory is a concern, consider adding a PQ encoder to your HNSW or IVF index. Because PQ is a lossy encoding, query quality will drop.
-You can reduce the memory footprint by a factor of 2, with a minimal loss in search quality, by using the [`fp_16` encoder]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#faiss-16-bit-scalar-quantization). If your vector dimensions are within the [-128, 127] byte range, we recommend using the [byte quantizer]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#lucene-byte-vector) to reduce the memory footprint by a factor of 4. To learn more about vector quantization options, see [k-NN vector quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/).
+You can reduce the memory footprint by a factor of 2, with a minimal loss in search quality, by using the [`fp_16` encoder]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#faiss-16-bit-scalar-quantization). If your vector dimensions are within the [-128, 127] byte range, we recommend using the [byte quantizer]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#byte-vectors) to reduce the memory footprint by a factor of 4. To learn more about vector quantization options, see [k-NN vector quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/).
### Memory estimation
diff --git a/_search-plugins/knn/knn-score-script.md b/_search-plugins/knn/knn-score-script.md
index d2fd883e74..a184de2d3d 100644
--- a/_search-plugins/knn/knn-score-script.md
+++ b/_search-plugins/knn/knn-score-script.md
@@ -302,5 +302,5 @@ Cosine similarity returns a number between -1 and 1, and because OpenSearch rele
With cosine similarity, it is not valid to pass a zero vector (`[0, 0, ... ]`) as input. This is because the magnitude of such a vector is 0, which raises a `divide by 0` exception in the corresponding formula. Requests containing the zero vector will be rejected, and a corresponding exception will be thrown.
{: .note }
-The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors).
+The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors).
{: .note}
diff --git a/_search-plugins/knn/knn-vector-quantization.md b/_search-plugins/knn/knn-vector-quantization.md
index 656ce72fd2..a911dc91c9 100644
--- a/_search-plugins/knn/knn-vector-quantization.md
+++ b/_search-plugins/knn/knn-vector-quantization.md
@@ -11,15 +11,15 @@ has_math: true
By default, the k-NN plugin supports the indexing and querying of vectors of type `float`, where each dimension of the vector occupies 4 bytes of memory. For use cases that require ingestion on a large scale, keeping `float` vectors can be expensive because OpenSearch needs to construct, load, save, and search graphs (for native `nmslib` and `faiss` engines). To reduce the memory footprint, you can use vector quantization.
-OpenSearch supports many varieties of quantization. In general, the level of quantization will provide a trade-off between the accuracy of the nearest neighbor search and the size of the memory footprint consumed by the vector search. The supported types include byte vectors, 16-bit scalar quantization, and product quantization (PQ).
+OpenSearch supports many varieties of quantization. In general, the level of quantization will provide a trade-off between the accuracy of the nearest neighbor search and the size of the memory footprint consumed by the vector search. The supported types include byte vectors, 16-bit scalar quantization, product quantization (PQ), and binary quantization (BQ).
-## Lucene byte vector
+## Byte vectors
-Starting with k-NN plugin version 2.9, you can use `byte` vectors with the Lucene engine in order to reduce the amount of required memory. This requires quantizing the vectors outside of OpenSearch before ingesting them into an OpenSearch index. For more information, see [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#lucene-byte-vector).
+Starting with version 2.17, the k-NN plugin supports `byte` vectors with the `faiss` and `lucene` engines in order to reduce the amount of required memory. This requires quantizing the vectors outside of OpenSearch before ingesting them into an OpenSearch index. For more information, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#byte-vectors).
## Lucene scalar quantization
-Starting with version 2.16, the k-NN plugin supports built-in scalar quantization for the Lucene engine. Unlike the [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#lucene-byte-vector), which requires you to quantize vectors before ingesting the documents, the Lucene scalar quantizer quantizes input vectors in OpenSearch during ingestion. The Lucene scalar quantizer converts 32-bit floating-point input vectors into 7-bit integer vectors in each segment using the minimum and maximum quantiles computed based on the [`confidence_interval`](#confidence-interval) parameter. During search, the query vector is quantized in each segment using the segment's minimum and maximum quantiles in order to compute the distance between the query vector and the segment's quantized input vectors.
+Starting with version 2.16, the k-NN plugin supports built-in scalar quantization for the Lucene engine. Unlike [byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#byte-vectors), which require you to quantize vectors before ingesting documents, the Lucene scalar quantizer quantizes input vectors in OpenSearch during ingestion. The Lucene scalar quantizer converts 32-bit floating-point input vectors into 7-bit integer vectors in each segment using the minimum and maximum quantiles computed based on the [`confidence_interval`](#confidence-interval) parameter. During search, the query vector is quantized in each segment using the segment's minimum and maximum quantiles in order to compute the distance between the query vector and the segment's quantized input vectors.
Quantization can decrease the memory footprint by a factor of 4 in exchange for some loss in recall. Additionally, quantization slightly increases disk usage because it requires storing both the raw input vectors and the quantized vectors.
@@ -40,10 +40,10 @@ PUT /test-index
"my_vector1": {
"type": "knn_vector",
"dimension": 2,
+ "space_type": "l2",
"method": {
"name": "hnsw",
"engine": "lucene",
- "space_type": "l2",
"parameters": {
"encoder": {
"name": "sq"
@@ -85,10 +85,10 @@ PUT /test-index
"my_vector1": {
"type": "knn_vector",
"dimension": 2,
+ "space_type": "l2",
"method": {
"name": "hnsw",
"engine": "lucene",
- "space_type": "l2",
"parameters": {
"encoder": {
"name": "sq",
@@ -115,7 +115,7 @@ In the ideal scenario, 7-bit vectors created by the Lucene scalar quantizer use
#### HNSW memory estimation
-The memory required for the Hierarchical Navigable Small World (HNSW) graph can be estimated as `1.1 * (dimension + 8 * M)` bytes/vector, where `M` is the maximum number of bidirectional links created for each element during the construction of the graph.
+The memory required for the Hierarchical Navigable Small World (HNSW) graph can be estimated as `1.1 * (dimension + 8 * m)` bytes/vector, where `m` is the maximum number of bidirectional links created for each element during the construction of the graph.
As an example, assume that you have 1 million vectors with a dimension of 256 and M of 16. The memory requirement can be estimated as follows:
@@ -150,10 +150,10 @@ PUT /test-index
"my_vector1": {
"type": "knn_vector",
"dimension": 3,
+ "space_type": "l2",
"method": {
"name": "hnsw",
"engine": "faiss",
- "space_type": "l2",
"parameters": {
"encoder": {
"name": "sq"
@@ -194,10 +194,10 @@ PUT /test-index
"my_vector1": {
"type": "knn_vector",
"dimension": 3,
+ "space_type": "l2",
"method": {
"name": "hnsw",
"engine": "faiss",
- "space_type": "l2",
"parameters": {
"encoder": {
"name": "sq",
@@ -250,9 +250,9 @@ In the best-case scenario, 16-bit vectors produced by the Faiss SQfp16 quantizer
#### HNSW memory estimation
-The memory required for Hierarchical Navigable Small Worlds (HNSW) is estimated to be `1.1 * (2 * dimension + 8 * M)` bytes/vector.
+The memory required for Hierarchical Navigable Small Worlds (HNSW) is estimated to be `1.1 * (2 * dimension + 8 * m)` bytes/vector, where `m` is the maximum number of bidirectional links created for each element during the construction of the graph.
-As an example, assume that you have 1 million vectors with a dimension of 256 and M of 16. The memory requirement can be estimated as follows:
+As an example, assume that you have 1 million vectors with a dimension of 256 and an `m` of 16. The memory requirement can be estimated as follows:
```r
1.1 * (2 * 256 + 8 * 16) * 1,000,000 ~= 0.656 GB
@@ -260,9 +260,9 @@ As an example, assume that you have 1 million vectors with a dimension of 256 an
#### IVF memory estimation
-The memory required for IVF is estimated to be `1.1 * (((2 * dimension) * num_vectors) + (4 * nlist * d))` bytes/vector.
+The memory required for IVF is estimated to be `1.1 * (((2 * dimension) * num_vectors) + (4 * nlist * dimension))` bytes, where `nlist` is the number of buckets to partition vectors into.
-As an example, assume that you have 1 million vectors with a dimension of 256 and `nlist` of 128. The memory requirement can be estimated as follows:
+As an example, assume that you have 1 million vectors with a dimension of 256 and an `nlist` of 128. The memory requirement can be estimated as follows:
```r
1.1 * (((2 * 256) * 1,000,000) + (4 * 128 * 256)) ~= 0.525 GB
@@ -310,3 +310,175 @@ For example, assume that you have 1 million vectors with a dimension of 256, `iv
```r
1.1*((8 / 8 * 64 + 24) * 1000000 + 100 * (2^8 * 4 * 256 + 4 * 512 * 256)) ~= 0.171 GB
```
+
+## Binary quantization
+
+Starting with version 2.17, OpenSearch supports binary quantization (BQ) with binary vector support for the Faiss engine. BQ compresses vectors into a binary format (0s and 1s), making it highly efficient in terms of memory usage. You can choose to represent each vector dimension using 1, 2, or 4 bits, depending on the desired precision. One of the advantages of using BQ is that the training process is handled automatically during indexing. This means that no separate training step is required, unlike other quantization techniques such as PQ.
+
+### Using BQ
+To configure BQ for the Faiss engine, define a `knn_vector` field and specify the `mode` as `on_disk`. This configuration defaults to 1-bit BQ, with both `ef_search` and `ef_construction` set to `100`:
+
+```json
+PUT my-vector-index
+{
+ "mappings": {
+ "properties": {
+ "my_vector_field": {
+ "type": "knn_vector",
+ "dimension": 8,
+ "space_type": "l2",
+ "data_type": "float",
+ "mode": "on_disk"
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+To further optimize the configuration, you can specify additional parameters, such as the compression level, and fine-tune the search parameters. For example, you can override the `ef_construction` value or define the compression level, which corresponds to the number of bits used for quantization:
+
+- **32x compression** for 1-bit quantization
+- **16x compression** for 2-bit quantization
+- **8x compression** for 4-bit quantization
+
+This allows for greater control over memory usage and recall performance, providing flexibility to balance between precision and storage efficiency.
+
+To specify the compression level, set the `compression_level` parameter:
+
+```json
+PUT my-vector-index
+{
+ "mappings": {
+ "properties": {
+ "my_vector_field": {
+ "type": "knn_vector",
+ "dimension": 8,
+ "space_type": "l2",
+ "data_type": "float",
+ "mode": "on_disk",
+ "compression_level": "16x",
+ "method": {
+ "params": {
+ "ef_construction": 16
+ }
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The following example further fine-tunes the configuration by defining `ef_construction`, `encoder`, and the number of `bits` (which can be `1`, `2`, or `4`):
+
+```json
+PUT my-vector-index
+{
+ "mappings": {
+ "properties": {
+ "my_vector_field": {
+ "type": "knn_vector",
+ "dimension": 8,
+ "method": {
+ "name": "hnsw",
+ "engine": "faiss",
+ "space_type": "l2",
+ "params": {
+ "m": 16,
+ "ef_construction": 512,
+ "encoder": {
+ "name": "binary",
+ "parameters": {
+ "bits": 1
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+### Search using binary quantized vectors
+
+You can perform a k-NN search on your index by providing a vector and specifying the number of nearest neighbors (k) to return:
+
+```json
+GET my-vector-index/_search
+{
+ "size": 2,
+ "query": {
+ "knn": {
+ "my_vector_field": {
+ "vector": [1.5, 5.5, 1.5, 5.5, 1.5, 5.5, 1.5, 5.5],
+ "k": 10
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+You can also fine-tune search by providing the `ef_search` and `oversample_factor` parameters.
+The `oversample_factor` parameter controls the factor by which the search oversamples the candidate vectors before ranking them. Using a higher oversample factor means that more candidates will be considered before ranking, improving accuracy but also increasing search time. When selecting the `oversample_factor` value, consider the trade-off between accuracy and efficiency. For example, setting the `oversample_factor` to `2.0` will double the number of candidates considered during the ranking phase, which may help achieve better results.
+
+The following request specifies the `ef_search` and `oversample_factor` parameters:
+
+```json
+GET my-vector-index/_search
+{
+ "size": 2,
+ "query": {
+ "knn": {
+ "my_vector_field": {
+ "vector": [1.5, 5.5, 1.5, 5.5, 1.5, 5.5, 1.5, 5.5],
+ "k": 10,
+ "method_parameters": {
+ "ef_search": 10
+ },
+ "rescore": {
+ "oversample_factor": 10.0
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+
+#### HNSW memory estimation
+
+The memory required for the Hierarchical Navigable Small World (HNSW) graph can be estimated as `1.1 * ((dimension * bits / 8) + 8 * m)` bytes/vector, where `bits` is the number of bits used for quantization (1, 2, or 4) and `m` is the maximum number of bidirectional links created for each element during the construction of the graph.
+
+As an example, assume that you have 1 million vectors with a dimension of 256 and an `m` of 16. The following sections provide memory requirement estimations for various compression values.
+
+##### 1-bit quantization (32x compression)
+
+In 1-bit quantization, each dimension is represented using 1 bit, equivalent to a 32x compression factor. The memory requirement can be estimated as follows:
+
+```r
+Memory = 1.1 * ((256 * 1 / 8) + 8 * 16) * 1,000,000
+ ~= 0.176 GB
+```
+
+##### 2-bit quantization (16x compression)
+
+In 2-bit quantization, each dimension is represented using 2 bits, equivalent to a 16x compression factor. The memory requirement can be estimated as follows:
+
+```r
+Memory = 1.1 * ((256 * 2 / 8) + 8 * 16) * 1,000,000
+ ~= 0.211 GB
+```
+
+##### 4-bit quantization (8x compression)
+
+In 4-bit quantization, each dimension is represented using 4 bits, equivalent to an 8x compression factor. The memory requirement can be estimated as follows:
+
+```r
+Memory = 1.1 * ((256 * 4 / 8) + 8 * 16) * 1,000,000
+ ~= 0.282 GB
+```
diff --git a/_search-plugins/knn/nested-search-knn.md b/_search-plugins/knn/nested-search-knn.md
index d947ebc6e6..bbba6c9c1e 100644
--- a/_search-plugins/knn/nested-search-knn.md
+++ b/_search-plugins/knn/nested-search-knn.md
@@ -38,9 +38,9 @@ PUT my-knn-index-1
"my_vector": {
"type": "knn_vector",
"dimension": 3,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "lucene",
"parameters": {
"ef_construction": 100,
@@ -324,9 +324,9 @@ PUT my-knn-index-1
"my_vector": {
"type": "knn_vector",
"dimension": 3,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "lucene",
"parameters": {
"ef_construction": 100,
diff --git a/_search-plugins/knn/painless-functions.md b/_search-plugins/knn/painless-functions.md
index cc27776fc4..7a8d9fec7b 100644
--- a/_search-plugins/knn/painless-functions.md
+++ b/_search-plugins/knn/painless-functions.md
@@ -55,7 +55,7 @@ l1Norm | `float l1Norm (float[] queryVector, doc['vector field'])` | This functi
cosineSimilarity | `float cosineSimilarity (float[] queryVector, doc['vector field'])` | Cosine similarity is an inner product of the query vector and document vector normalized to both have a length of 1. If the magnitude of the query vector doesn't change throughout the query, you can pass the magnitude of the query vector to improve performance, instead of calculating the magnitude every time for every filtered document:
`float cosineSimilarity (float[] queryVector, doc['vector field'], float normQueryVector)`
In general, the range of cosine similarity is [-1, 1]. However, in the case of information retrieval, the cosine similarity of two documents ranges from 0 to 1 because the tf-idf statistic can't be negative. Therefore, the k-NN plugin adds 1.0 in order to always yield a positive cosine similarity score.
hamming | `float hamming (float[] queryVector, doc['vector field'])` | This function calculates the Hamming distance between a given query vector and document vectors. The Hamming distance is the number of positions at which the corresponding elements are different. The shorter the distance, the more relevant the document is, so this example inverts the return value of the Hamming distance.
-The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors).
+The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors).
{: .note}
## Constraints
diff --git a/_search-plugins/knn/performance-tuning.md b/_search-plugins/knn/performance-tuning.md
index 123b1daef1..77f44dee93 100644
--- a/_search-plugins/knn/performance-tuning.md
+++ b/_search-plugins/knn/performance-tuning.md
@@ -59,9 +59,9 @@ The `_source` field contains the original JSON document body that was passed at
"location": {
"type": "knn_vector",
"dimension": 2,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "faiss"
}
}
@@ -85,9 +85,9 @@ In OpenSearch 2.15 or later, you can further improve indexing speed and reduce d
"location": {
"type": "knn_vector",
"dimension": 2,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "faiss"
}
}
diff --git a/_search-plugins/knn/radial-search-knn.md b/_search-plugins/knn/radial-search-knn.md
index 1a4a223294..e5449a0993 100644
--- a/_search-plugins/knn/radial-search-knn.md
+++ b/_search-plugins/knn/radial-search-knn.md
@@ -53,9 +53,9 @@ PUT knn-index-test
"my_vector": {
"type": "knn_vector",
"dimension": 2,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "faiss",
"parameters": {
"ef_construction": 100,
diff --git a/_search-plugins/knn/settings.md b/_search-plugins/knn/settings.md
index 1b9aa3608c..e4731ec94c 100644
--- a/_search-plugins/knn/settings.md
+++ b/_search-plugins/knn/settings.md
@@ -27,6 +27,7 @@ Setting | Static/Dynamic | Default | Description
`knn.model.index.number_of_replicas`| Dynamic | `1` | The number of replica shards to use for the model system index. Generally, in a multi-node cluster, this value should be at least 1 in order to increase stability.
`knn.model.cache.size.limit` | Dynamic | `10%` | The model cache limit cannot exceed 25% of the JVM heap.
`knn.faiss.avx2.disabled` | Static | `false` | A static setting that specifies whether to disable the SIMD-based `libopensearchknn_faiss_avx2.so` library and load the non-optimized `libopensearchknn_faiss.so` library for the Faiss engine on machines with x64 architecture. For more information, see [SIMD optimization for the Faiss engine]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#simd-optimization-for-the-faiss-engine).
+`knn.faiss.avx512.disabled` | Static | `false` | A static setting that specifies whether to disable the SIMD-based `libopensearchknn_faiss_avx512.so` library and load the `libopensearchknn_faiss_avx2.so` library or the non-optimized `libopensearchknn_faiss.so` library for the Faiss engine on machines with x64 architecture. For more information, see [SIMD optimization for the Faiss engine]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#simd-optimization-for-the-faiss-engine).
## Index settings
diff --git a/_search-plugins/search-pipelines/using-search-pipeline.md b/_search-plugins/search-pipelines/using-search-pipeline.md
index ecb988ad11..b6dbbdc5d0 100644
--- a/_search-plugins/search-pipelines/using-search-pipeline.md
+++ b/_search-plugins/search-pipelines/using-search-pipeline.md
@@ -17,14 +17,45 @@ You can use a search pipeline in the following ways:
## Specifying an existing search pipeline for a request
-After you [create a search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/), you can use the pipeline with a query by specifying the pipeline name in the `search_pipeline` query parameter:
+After you [create a search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/), you can use the pipeline with a query in the following ways. For a complete example of using a search pipeline with a `filter_query` processor, see [`filter_query` processor example]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/filter-query-processor#example).
+
+### Specifying the pipeline in a query parameter
+
+You can specify the pipeline name in the `search_pipeline` query parameter as follows:
```json
GET /my_index/_search?search_pipeline=my_pipeline
```
{% include copy-curl.html %}
-For a complete example of using a search pipeline with a `filter_query` processor, see [`filter_query` processor example]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/filter-query-processor#example).
+### Specifying the pipeline in the request body
+
+You can provide a search pipeline ID in the search request body as follows:
+
+```json
+GET /my-index/_search
+{
+ "query": {
+ "match_all": {}
+ },
+ "from": 0,
+ "size": 10,
+ "search_pipeline": "my_pipeline"
+}
+```
+{% include copy-curl.html %}
+
+For multi-search, you can provide a search pipeline ID in the search request body as follows:
+
+```json
+GET /_msearch
+{ "index": "test"}
+{ "query": { "match_all": {} }, "from": 0, "size": 10, "search_pipeline": "my_pipeline"}
+{ "index": "test-1", "search_type": "dfs_query_then_fetch"}
+{ "query": { "match_all": {} }, "search_pipeline": "my_pipeline1" }
+
+```
+{% include copy-curl.html %}
## Using a temporary search pipeline for a request
diff --git a/_search-plugins/searching-data/inner-hits.md b/_search-plugins/searching-data/inner-hits.md
index 395e9e748a..38fc7a491d 100644
--- a/_search-plugins/searching-data/inner-hits.md
+++ b/_search-plugins/searching-data/inner-hits.md
@@ -139,8 +139,8 @@ The preceding query searches for nested user objects containing the name John an
}
}
```
-## Inner hits with parent-child objects
-Parent-join relationships allow you to create relationships between documents of different types within the same index. The following example request searches with `inner_hits` using parent-child objects.
+## Inner hits with parent/child objects
+Parent-join relationships allow you to create relationships between documents of different types within the same index. The following example request searches with `inner_hits` using parent/child objects.
1. Create an index with a parent-join field:
@@ -806,4 +806,8 @@ The following is the expected result:
Using `inner_hits` provides contextual relevance by showing exactly which nested or child documents match the query criteria. This is crucial for applications in which the relevance of results depends on a specific part of the document that matches the query.
- Example use case: In a customer support system, you have tickets as parent documents and comments or updates as nested or child documents. You can determine which specific comment matches the search in order to better understand the context of the ticket search.
\ No newline at end of file
+ Example use case: In a customer support system, you have tickets as parent documents and comments or updates as nested or child documents. You can determine which specific comment matches the search in order to better understand the context of the ticket search.
+
+## Next steps
+
+- Learn about [joining queries]({{site.url}}{{site.baseurl}}/query-dsl/joining/) on [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) or [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) fields.
\ No newline at end of file
diff --git a/_search-plugins/vector-search.md b/_search-plugins/vector-search.md
index cd893f4144..f19030bf90 100644
--- a/_search-plugins/vector-search.md
+++ b/_search-plugins/vector-search.md
@@ -37,9 +37,9 @@ PUT test-index
"my_vector1": {
"type": "knn_vector",
"dimension": 1024,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "nmslib",
"parameters": {
"ef_construction": 128,
@@ -57,7 +57,7 @@ PUT test-index
You must designate the field that will store vectors as a [`knn_vector`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/) field type. OpenSearch supports vectors of up to 16,000 dimensions, each of which is represented as a 32-bit or 16-bit float.
-To save storage space, you can use `byte` or `binary` vectors. For more information, see [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#lucene-byte-vector) and [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors).
+To save storage space, you can use `byte` or `binary` vectors. For more information, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#byte-vectors) and [Binary vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors).
### k-NN vector search
@@ -131,9 +131,9 @@ PUT /hotels-index
"location": {
"type": "knn_vector",
"dimension": 2,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "lucene",
"parameters": {
"ef_construction": 100,
diff --git a/_security-analytics/threat-intelligence/getting-started.md b/_security-analytics/threat-intelligence/getting-started.md
index 366bc2674c..b26063bed0 100644
--- a/_security-analytics/threat-intelligence/getting-started.md
+++ b/_security-analytics/threat-intelligence/getting-started.md
@@ -50,15 +50,64 @@ Local files uploaded as the threat intelligence source must use the following sp
When using the `S3_SOURCE` as a remote store, the following connection information must be provided:
-- **IAM Role ARN**: The Amazon Resource Name (ARN) for an AWS Identity and Access Management (IAM) role.
-- **S3 bucket directory**: The name of the Amazon Simple Storage Service (Amazon S3) bucket in which the `STIX2` file is stored.
-- **Specify a directory or file**: The object key or directory path for the `STIX2` file in the S3 bucket.
+- **IAM Role ARN**: The Amazon Resource Name (ARN) for an AWS Identity and Access Management (IAM) role. When using the AWS OpenSearch Service, the role ARN needs to be in the same account as the OpenSearch domain. For more information about adding a new role for the AWS OpenSearch Service, see [Add service ARN](#add-aws-opensearch-service-arn).
+- **S3 bucket directory**: The name of the Amazon Simple Storage Service (Amazon S3) bucket in which the `STIX2` file is stored. To access an S3 bucket in a different AWS account, see the [Cross-account S3 bucket connection](#cross-account-s3-bucket-connection) section for more details.
+- **Specify a file**: The object key for the `STIX2` file in the S3 bucket.
- **Region**: The AWS Region for the S3 bucket.
You can also set the **Download schedule**, which determines to where OpenSearch downloads an updated `STIX2` file from the connected S3 bucket. The default interval is once a day. Only daily intervals are supported.
Alternatively, you can check the **Download on demand** option, which prevents new data from the bucket from being automatically downloaded.
+#### Add AWS OpenSearch Service ARN
+
+If you're using the AWS OpenSearch Service, create a new ARN role with a custom trust policy. For instructions on how to create the role, see [Creating a role for an AWS service](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_create_for-service.html#roles-creatingrole-service-console).
+
+When creating the role, customize the following settings:
+
+- Add the following custom trust policy:
+
+ ```bash
+ {
+ "Version": "2012-10-17",
+ "Statement": [
+ {
+ "Effect": "Allow",
+ "Principal": {
+ "Service": [
+ "opensearchservice.amazonaws.com"
+ ]
+ },
+ "Action": "sts:AssumeRole"
+ }
+ ]
+ }
+ ```
+
+- On the Permissions policies page, add the `AmazonS3ReadOnlyAccess` permission.
+
+
+#### Cross-account S3 bucket connection
+
+Because the role ARN needs to be in the same account as the OpenSearch domain, a trust policy needs to be configured that allows the OpenSearch domain to download from S3 buckets in the same account.
+
+To download from an S3 bucket in another account, the trust policy for that bucket needs to give the role ARN permission to read from the object, as shown in the following example:
+
+```
+{
+ "Version": "2012-10-17",
+ "Statement": [
+ {
+ "Effect": "Allow",
+ "Principal": {
+ "AWS": "arn:aws:iam::123456789012:role/account-1-threat-intel-role"
+ },
+ "Action": "s3:*",
+ "Resource": "arn:aws:s3:::account-2-threat-intel-bucket/*"
+ }
+ ]
+}
+```
## Step 2: Set up scanning for your log sources
diff --git a/_security/access-control/document-level-security.md b/_security/access-control/document-level-security.md
index 352fe06a61..b17b60e147 100644
--- a/_security/access-control/document-level-security.md
+++ b/_security/access-control/document-level-security.md
@@ -13,6 +13,8 @@ Document-level security lets you restrict a role to a subset of documents in an
![Document- and field-level security screen in OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/images/security-dls.png)
+The maximum size for the document-level security configuration is 1024 KB (1,048,404 characters).
+{: .warning}
## Simple roles
diff --git a/_security/audit-logs/index.md b/_security/audit-logs/index.md
index becb001ec0..8eeea33447 100644
--- a/_security/audit-logs/index.md
+++ b/_security/audit-logs/index.md
@@ -224,3 +224,36 @@ plugins.security.audit.config.threadpool.max_queue_len: 100000
To disable audit logs after they've been enabled, remove the `plugins.security.audit.type: internal_opensearch` setting from `opensearch.yml`, or switch off the **Enable audit logging** check box in OpenSearch Dashboards.
+## Audit user account manipulation
+
+To enable audit logging on changes to a security index, such as changes to roles mappings and role creation or deletion, use the following settings in the `compliance:` portion of the audit log configuration, as shown in the following example:
+
+```
+_meta:
+ type: "audit"
+ config_version: 2
+
+config:
+ # enable/disable audit logging
+ enabled: true
+
+ ...
+
+
+ compliance:
+ # enable/disable compliance
+ enabled: true
+
+ # Log updates to internal security changes
+ internal_config: true
+
+ # Log only metadata of the document for write events
+ write_metadata_only: false
+
+ # Log only diffs for document updates
+ write_log_diffs: true
+
+ # List of indices to watch for write events. Wildcard patterns are supported
+ # write_watched_indices: ["twitter", "logs-*"]
+ write_watched_indices: [".opendistro_security"]
+```
diff --git a/_security/authentication-backends/jwt.md b/_security/authentication-backends/jwt.md
index 3f28dfecfd..6c7311e7dc 100644
--- a/_security/authentication-backends/jwt.md
+++ b/_security/authentication-backends/jwt.md
@@ -117,7 +117,7 @@ The following table lists the configuration parameters.
Name | Description
:--- | :---
-`signing_key` | The signing key to use when verifying the token. If you use a symmetric key algorithm, it is the base64-encoded shared secret. If you use an asymmetric algorithm, it contains the public key.
+`signing_key` | The signing key(s) used to verify the token. If you use a symmetric key algorithm, this is the Base64-encoded shared secret. If you use an asymmetric algorithm, this setting contains the public key. To pass multiple keys, use a comma-separated list or enumerate the keys.
`jwt_header` | The HTTP header in which the token is transmitted. This is typically the `Authorization` header with the `Bearer` schema,`Authorization: Bearer `. Default is `Authorization`. Replacing this field with a value other than `Authorization` prevents the audit log from properly redacting the JWT header from audit messages. It is recommended that users only use `Authorization` when using JWTs with audit logging.
`jwt_url_parameter` | If the token is not transmitted in the HTTP header but rather as an URL parameter, define the name of the parameter here.
`subject_key` | The key in the JSON payload that stores the username. If not set, the [subject](https://tools.ietf.org/html/rfc7519#section-4.1.2) registered claim is used.
diff --git a/_security/configuration/disable-enable-security.md b/_security/configuration/disable-enable-security.md
index 811fd2a69f..38bcc01cdd 100755
--- a/_security/configuration/disable-enable-security.md
+++ b/_security/configuration/disable-enable-security.md
@@ -155,22 +155,22 @@ Use the following steps to reinstall the plugin:
1. Disable shard allocation and stop all nodes so that shards don't move when the cluster is restarted:
- ```json
- curl -XPUT "http://localhost:9200/_cluster/settings" -H 'Content-Type: application/json' -d '{
- "transient": {
- "cluster.routing.allocation.enable": "none"
- }
- }'
- ```
- {% include copy.html %}
+ ```json
+ curl -XPUT "http://localhost:9200/_cluster/settings" -H 'Content-Type: application/json' -d '{
+ "transient": {
+ "cluster.routing.allocation.enable": "none"
+ }
+ }'
+ ```
+ {% include copy.html %}
2. Install the Security plugin on all nodes in your cluster using one of the [installation methods]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/#install):
- ```bash
- bin/opensearch-plugin install opensearch-security
- ```
- {% include copy.html %}
-
+ ```bash
+ bin/opensearch-plugin install opensearch-security
+ ```
+ {% include copy.html %}
+
3. Add the necessary configuration to `opensearch.yml` for TLS encryption. See
[Configuration]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/security-settings/) for information about the settings that need to be configured.
diff --git a/_security/configuration/index.md b/_security/configuration/index.md
index e351e8865f..f68667d92d 100644
--- a/_security/configuration/index.md
+++ b/_security/configuration/index.md
@@ -3,7 +3,7 @@ layout: default
title: Configuration
nav_order: 2
has_children: true
-has_toc: false
+has_toc: true
redirect_from:
- /security-plugin/configuration/
- /security-plugin/configuration/index/
@@ -11,21 +11,105 @@ redirect_from:
# Security configuration
-The plugin includes demo certificates so that you can get up and running quickly. To use OpenSearch in a production environment, you must configure it manually:
+The Security plugin includes demo certificates so that you can get up and running quickly. To use OpenSearch with the Security plugin in a production environment, you must make changes to the demo certificates and other configuration options manually.
-1. [Replace the demo certificates]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/docker/#configuring-basic-security-settings).
-1. [Reconfigure `opensearch.yml` to use your certificates]({{site.url}}{{site.baseurl}}/security/configuration/tls).
-1. [Reconfigure `config.yml` to use your authentication backend]({{site.url}}{{site.baseurl}}/security/configuration/configuration/) (if you don't plan to use the internal user database).
-1. [Modify the configuration YAML files]({{site.url}}{{site.baseurl}}/security/configuration/yaml).
-1. If you plan to use the internal user database, [set a password policy in `opensearch.yml`]({{site.url}}{{site.baseurl}}/security/configuration/yaml/#opensearchyml).
-1. [Apply changes using the `securityadmin` script]({{site.url}}{{site.baseurl}}/security/configuration/security-admin).
-1. Start OpenSearch.
-1. [Add users, roles, role mappings, and tenants]({{site.url}}{{site.baseurl}}/security/access-control/index/).
+## Replace the demo certificates
-If you don't want to use the plugin, see [Disable security]({{site.url}}{{site.baseurl}}/security/configuration/disable-enable-security/).
+OpenSearch ships with demo certificates intended for quick setup and demonstration purposes. For a production environment, it's critical to replace these with your own trusted certificates, using the following steps, to ensure secure communication:
-The Security plugin has several default users, roles, action groups, permissions, and settings for OpenSearch Dashboards that use kibana in their names. We will change these names in a future release.
+1. **Generate your own certificates:** Use tools like OpenSSL or a certificate authority (CA) to generate your own certificates. For more information about generating certificates with OpenSSL, see [Generating self-signed certificates]({{site.url}}{{site.baseurl}}/security/configuration/generate-certificates/).
+2. **Store the generated certificates and private key in the appropriate directory:** Generated certificates are typically stored in `/config/`. For more information, see [Add certificate files to opensearch.yml]({{site.url}}{{site.baseurl}}/security/configuration/generate-certificates/#add-certificate-files-to-opensearchyml).
+3. **Set the following file permissions:**
+ - Private key (.key files): Set the file mode to `600`. This restricts access so that only the file owner (the OpenSearch user) can read and write to the file, ensuring that the private key remains secure and inaccessible to unauthorized users.
+ - Public certificates (.crt, .pem files): Set the file mode to `644`. This allows the file owner to read and write to the file, while other users can only read it.
+
+For additional guidance on file modes, see the following table.
+
+ | Item | Sample | Numeric | Bitwise |
+ |-------------|---------------------|---------|--------------|
+ | Public key | `~/.ssh/id_rsa.pub` | `644` | `-rw-r--r--` |
+ | Private key | `~/.ssh/id_rsa` | `600` | `-rw-------` |
+ | SSH folder | `~/.ssh` | `700` | `drwx------` |
+
+For more information, see [Configuring basic security settings]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/docker/#configuring-basic-security-settings).
+
+## Reconfigure `opensearch.yml` to use your certificates
+
+The `opensearch.yml` file is the main configuration file for OpenSearch; you can find the file at `/config/opensearch.yml`. Use the following steps to update this file to point to your custom certificates:
+
+In `opensearch.yml`, set the correct paths for your certificates and keys, as shown in the following example:
+ ```
+ plugins.security.ssl.transport.pemcert_filepath: /path/to/your/cert.pem
+ plugins.security.ssl.transport.pemkey_filepath: /path/to/your/key.pem
+ plugins.security.ssl.transport.pemtrustedcas_filepath: /path/to/your/ca.pem
+ plugins.security.ssl.http.enabled: true
+ plugins.security.ssl.http.pemcert_filepath: /path/to/your/cert.pem
+ plugins.security.ssl.http.pemkey_filepath: /path/to/your/key.pem
+ plugins.security.ssl.http.pemtrustedcas_filepath: /path/to/your/ca.pem
+ ```
+For more information, see [Configuring TLS certificates]({{site.url}}{{site.baseurl}}/security/configuration/tls/).
+
+## Reconfigure `config.yml` to use your authentication backend
+
+The `config.yml` file allows you to configure the authentication and authorization mechanisms for OpenSearch. Update the authentication backend settings in `/config/opensearch-security/config.yml` according to your requirements.
+
+For example, to use LDAP as your authentication backend, add the following settings:
+
+ ```
+ authc:
+ basic_internal_auth:
+ http_enabled: true
+ transport_enabled: true
+ order: 1
+ http_authenticator:
+ type: basic
+ challenge: true
+ authentication_backend:
+ type: internal
+ ```
+For more information, see [Configuring the Security backend]({{site.url}}{{site.baseurl}}/security/configuration/configuration/).
+
+## Modify the configuration YAML files
+
+Determine whether any additional YAML files need modification, for example, the `roles.yml`, `roles_mapping.yml`, or `internal_users.yml` files. Update the files with any additional configuration information. For more information, see [Modifying the YAML files]({{site.url}}{{site.baseurl}}/security/configuration/yaml/).
+
+## Set a password policy
+
+When using the internal user database, we recommend enforcing a password policy to ensure that strong passwords are used. For information about strong password policies, see [Password settings]({{site.url}}{{site.baseurl}}/security/configuration/yaml/#password-settings).
+
+## Apply changes using the `securityadmin` script
+
+The following steps do not apply to first-time users because the security index is automatically initialized from the YAML configuration files when OpenSearch starts.
+{: .note}
+
+After initial setup, if you make changes to your security configuration or disable automatic initialization by setting `plugins.security.allow_default_init_securityindex` to `false` (which prevents security index initialization from `yaml` files), you need to manually apply changes using the `securityadmin` script:
+
+1. Find the `securityadmin` script. The script is typically stored in the OpenSearch plugins directory, `plugins/opensearch-security/tools/securityadmin.[sh|bat]`.
+ - Note: If you're using OpenSearch 1.x, the `securityadmin` script is located in the `plugins/opendistro_security/tools/` directory.
+ - For more information, see [Basic usage](https://opensearch.org/docs/latest/security/configuration/security-admin/#basic-usage).
+2. Run the script by using the following command:
+ ```
+ ./plugins/opensearch-security/tools/securityadmin.[sh|bat]
+ ```
+3. Check the OpenSearch logs and configuration to ensure that the changes have been successfully applied.
+
+For more information about using the `securityadmin` script, see [Applying changes to configuration files]({{site.url}}{{site.baseurl}}/security/configuration/security-admin/).
+
+## Disable or enable the Security plugin
+
+If you don't want to use the Security plugin, you can disable it by adding the following setting to the `opensearch.yml` file:
+
+```
+plugins.security.disabled: true
+```
+
+You can then enable the plugin by removing the `plugins.security.disabled` setting.
+
+For more information about disabling the Security plugin, see [Disable security]({{site.url}}{{site.baseurl}}/security/configuration/disable-enable-security/).
+
+The Security plugin has several default users, roles, action groups, permissions, and settings for OpenSearch Dashboards that contain "Kibana" in their names. We will change these names in a future version.
{: .note }
-For a full list of `opensearch.yml` Security plugin settings, Security plugin settings, see [Security settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/security-settings/).
+For a full list of `opensearch.yml` Security plugin settings, see [Security settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/security-settings/).
{: .note}
+
diff --git a/_security/configuration/yaml.md b/_security/configuration/yaml.md
index 1686c8332e..2694e3a24f 100644
--- a/_security/configuration/yaml.md
+++ b/_security/configuration/yaml.md
@@ -265,7 +265,7 @@ kibana_server:
## roles.yml
-This file contains any initial roles that you want to add to the Security plugin. Aside from some metadata, the default file is empty, because the Security plugin has a number of static roles that it adds automatically.
+This file contains any initial roles that you want to add to the Security plugin. By default, this file contains predefined roles that grant usage to plugins within the default distribution of OpenSearch. The Security plugin will also add a number of static roles automatically.
```yml
---
diff --git a/_tools/index.md b/_tools/index.md
index 108f10da97..c9d446a81a 100644
--- a/_tools/index.md
+++ b/_tools/index.md
@@ -18,6 +18,7 @@ This section provides documentation for OpenSearch-supported tools, including:
- [OpenSearch CLI](#opensearch-cli)
- [OpenSearch Kubernetes operator](#opensearch-kubernetes-operator)
- [OpenSearch upgrade, migration, and comparison tools](#opensearch-upgrade-migration-and-comparison-tools)
+- [Sycamore](#sycamore) for AI-powered extract, transform, load (ETL) on complex documents for vector and hybrid search
For information about Data Prepper, the server-side data collector for filtering, enriching, transforming, normalizing, and aggregating data for downstream analytics and visualization, see [Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/index/).
@@ -122,3 +123,9 @@ The OpenSearch Kubernetes Operator is an open-source Kubernetes operator that he
OpenSearch migration tools facilitate migrations to OpenSearch and upgrades to newer versions of OpenSearch. These can help you set up a proof-of-concept environment locally using Docker containers or deploy to AWS using a one-click deployment script. This empowers you to fine-tune cluster configurations and manage workloads more effectively before migration.
For more information about OpenSearch migration tools, see the documentation in the [OpenSearch Migration GitHub repository](https://github.com/opensearch-project/opensearch-migrations/tree/capture-and-replay-v0.1.0).
+
+## Sycamore
+
+[Sycamore](https://github.com/aryn-ai/sycamore) is an open-source, AI-powered document processing engine designed to prepare unstructured data for retrieval-augmented generation (RAG) and semantic search using Python. Sycamore supports chunking and enriching a wide range of complex document types, including reports, presentations, transcripts, and manuals. Additionally, Sycamore can extract and process embedded elements, such as tables, figures, graphs, and other infographics. It can then load the data into target indexes, including vector and keyword indexes, using an [OpenSearch connector](https://sycamore.readthedocs.io/en/stable/sycamore/connectors/opensearch.html).
+
+For more information, see [Sycamore]({{site.url}}{{site.baseurl}}/tools/sycamore/).
diff --git a/_tools/sycamore.md b/_tools/sycamore.md
new file mode 100644
index 0000000000..9b3986dbf3
--- /dev/null
+++ b/_tools/sycamore.md
@@ -0,0 +1,48 @@
+---
+layout: default
+title: Sycamore
+nav_order: 210
+has_children: false
+---
+
+# Sycamore
+
+[Sycamore](https://github.com/aryn-ai/sycamore) is an open-source, AI-powered document processing engine designed to prepare unstructured data for retrieval-augmented generation (RAG) and semantic search using Python. Sycamore supports chunking and enriching a wide range of complex document types, including reports, presentations, transcripts, and manuals. Additionally, Sycamore can extract and process embedded elements, such as tables, figures, graphs, and other infographics. It can then load the data into target indexes, including vector and keyword indexes, using a connector like the [OpenSearch connector](https://sycamore.readthedocs.io/en/stable/sycamore/connectors/opensearch.html).
+
+To get started, visit the [Sycamore documentation](https://sycamore.readthedocs.io/en/stable/sycamore/get_started.html).
+
+## Sycamore ETL pipeline structure
+
+A Sycamore extract, transform, load (ETL) pipeline applies a series of transformations to a [DocSet](https://sycamore.readthedocs.io/en/stable/sycamore/get_started/concepts.html#docsets), which is a collection of documents and their constituent elements (for example, tables, blocks of text, or headers). At the end of the pipeline, the DocSet is loaded into OpenSearch vector and keyword indexes.
+
+A typical pipeline for preparing unstructured data for vector or hybrid search in OpenSearch consists of the following steps:
+
+* Read documents into a [DocSet](https://sycamore.readthedocs.io/en/stable/sycamore/get_started/concepts.html#docsets).
+* [Partition documents](https://sycamore.readthedocs.io/en/stable/sycamore/transforms/partition.html) into structured JSON elements.
+* Extract metadata and filter and clean data using [transforms](https://sycamore.readthedocs.io/en/stable/sycamore/APIs/docset.html).
+* Create [chunks](https://sycamore.readthedocs.io/en/stable/sycamore/transforms/merge.html) from groups of elements.
+* Embed the chunks using the model of your choice.
+* [Load](https://sycamore.readthedocs.io/en/stable/sycamore/connectors/opensearch.html) the embeddings, metadata, and text into OpenSearch vector and keyword indexes.
+
+For an example pipeline that uses this workflow, see [this notebook](https://github.com/aryn-ai/sycamore/blob/main/notebooks/opensearch_docs_etl.ipynb).
+
+
+## Install Sycamore
+
+We recommend installing the Sycamore library using `pip`. The connector for OpenSearch can be specified and installed using extras. For example:
+
+```bash
+pip install sycamore-ai[opensearch]
+```
+{% include copy.html %}
+
+By default, Sycamore works with the Aryn Partitioning Service to process PDFs. To run inference locally for partitioning or embedding, install Sycamore with the `local-inference` extra as follows:
+
+```bash
+pip install sycamore-ai[opensearch,local-inference]
+```
+{% include copy.html %}
+
+## Next steps
+
+For more information, visit the [Sycamore documentation](https://sycamore.readthedocs.io/en/stable/sycamore/get_started.html).
diff --git a/_troubleshoot/tls.md b/_troubleshoot/tls.md
index 93e9a2c490..6c777ad5b8 100644
--- a/_troubleshoot/tls.md
+++ b/_troubleshoot/tls.md
@@ -207,7 +207,7 @@ plugins.security.ssl.http.enabled_protocols:
TLS relies on the server and client negotiating a common cipher suite. Depending on your system, the available ciphers will vary. They depend on the JDK or OpenSSL version you're using, and whether or not the `JCE Unlimited Strength Jurisdiction Policy Files` are installed.
-For legal reasons, the JDK does not include strong ciphers like AES256. In order to use strong ciphers you need to download and install the [Java Cryptography Extension (JCE) Unlimited Strength Jurisdiction Policy Files](https://www.oracle.com/technetwork/java/javase/downloads/jce8-download-2133166.html). If you don't have them installed, you might see an error message on startup:
+For legal reasons, the JDK does not include strong ciphers like AES256. In order to use strong ciphers you need to download and install the [Java Cryptography Extension (JCE) Unlimited Strength Jurisdiction Policy Files](https://www.oracle.com/java/technologies/javase-jce8-downloads.html). If you don't have them installed, you might see an error message on startup:
```
[INFO ] AES-256 not supported, max key length for AES is 128 bit.
diff --git a/_tuning-your-cluster/availability-and-recovery/remote-store/remote-cluster-state.md b/_tuning-your-cluster/availability-and-recovery/remote-store/remote-cluster-state.md
index d967aca914..03cd1716f0 100644
--- a/_tuning-your-cluster/availability-and-recovery/remote-store/remote-cluster-state.md
+++ b/_tuning-your-cluster/availability-and-recovery/remote-store/remote-cluster-state.md
@@ -67,10 +67,14 @@ The remote cluster state functionality has the following limitations:
## Remote cluster state publication
-
The cluster manager node processes updates to the cluster state. It then publishes the updated cluster state through the local transport layer to all of the follower nodes. With the `remote_store.publication` feature enabled, the cluster state is backed up to the remote store during every state update. The follower nodes can then fetch the state from the remote store directly, which reduces the overhead on the cluster manager node for publication.
-To enable the feature flag for the `remote_store.publication` feature, follow the steps in the [experimental feature flag documentation]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/experimental/).
+To enable this feature, configure the following setting in `opensearch.yml`:
+
+```yml
+# Enable Remote cluster state publication
+cluster.remote_store.publication.enabled: true
+```
Enabling the setting does not change the publication flow, and follower nodes will not send acknowledgements back to the cluster manager node
until they download the updated cluster state from the remote store.
@@ -89,8 +93,11 @@ You do not have to use different remote store repositories for state and routing
To configure remote publication, use the following cluster settings.
-Setting | Default | Description
-:--- | :--- | :---
-`cluster.remote_store.state.read_timeout` | 20s | The amount of time to wait for remote state download to complete on the follower node.
-`cluster.remote_store.routing_table.path_type` | HASHED_PREFIX | The path type to be used for creating an index routing path in the blob store. Valid values are `FIXED`, `HASHED_PREFIX`, and `HASHED_INFIX`.
-`cluster.remote_store.routing_table.path_hash_algo` | FNV_1A_BASE64 | The algorithm to be used for constructing the prefix or infix of the blob store path. This setting is applied if `cluster.remote_store.routing_table.path_type` is `hashed_prefix` or `hashed_infix`. Valid algorithm values are `FNV_1A_BASE64` and `FNV_1A_COMPOSITE_1`.
+Setting | Default | Description
+:--- |:---| :---
+`cluster.remote_store.state.read_timeout` | 20s | The amount of time to wait for the remote state download to complete on the follower node.
+`cluster.remote_store.state.path.prefix` | "" (Empty string) | The fixed prefix to add to the index metadata files in the blob store.
+`cluster.remote_store.index_metadata.path_type` | `HASHED_PREFIX` | The path type used for creating an index metadata path in the blob store. Valid values are `FIXED`, `HASHED_PREFIX`, and `HASHED_INFIX`.
+`cluster.remote_store.index_metadata.path_hash_algo` | `FNV_1A_BASE64` | The algorithm that constructs the prefix or infix for the index metadata path in the blob store. This setting is applied if the `cluster.remote_store.index_metadata.path_type` setting is `HASHED_PREFIX` or `HASHED_INFIX`. Valid algorithm values are `FNV_1A_BASE64` and `FNV_1A_COMPOSITE_1`.
+`cluster.remote_store.routing_table.path.prefix` | "" (Empty string) | The fixed prefix to add for the index routing files in the blob store.
+
diff --git a/_tuning-your-cluster/availability-and-recovery/remote-store/snapshot-interoperability.md b/_tuning-your-cluster/availability-and-recovery/remote-store/snapshot-interoperability.md
index 0415af65f1..e93f504be3 100644
--- a/_tuning-your-cluster/availability-and-recovery/remote-store/snapshot-interoperability.md
+++ b/_tuning-your-cluster/availability-and-recovery/remote-store/snapshot-interoperability.md
@@ -27,7 +27,7 @@ PUT /_snapshot/snap_repo
```
{% include copy-curl.html %}
-Once enabled, all requests using the [Snapshot API]({{site.url}}{{site.baseurl}}/api-reference/snapshots/index/) will remain the same for all snapshots. After the setting is enabled, we recommend not disabling the setting. Doing so could affect data durability.
+Once enabled, all requests using the [Snapshot API]({{site.url}}{{site.baseurl}}/api-reference/snapshots/index/) will remain the same for all snapshots. Therefore, do not disable the shallow snapshot setting after it has been enabled because disabling the setting could affect data durability.
## Considerations
@@ -37,3 +37,43 @@ Consider the following before using shallow copy snapshots:
- All nodes in the cluster must use OpenSearch 2.10 or later to take advantage of shallow copy snapshots.
- The `incremental` file count and size between the current snapshot and the last snapshot is `0` when using shallow copy snapshots.
- Searchable snapshots are not supported inside shallow copy snapshots.
+
+## Shallow snapshot v2
+
+Starting with OpenSearch 2.17, the shallow snapshot feature offers an improved version called `shallow snapshot v2`, which aims to make snapshot operations more efficient and scalable by introducing the following enhancements:
+
+* Deterministic snapshot operations: Shallow snapshot v2 makes snapshot operations more deterministic, ensuring consistent and predictable behavior.
+* Minimized cluster state updates: Shallow snapshot v2 minimizes the number of cluster state updates required during snapshot operations, reducing overhead and improving performance.
+* Scalability: Shallow snapshot v2 allows snapshot operations to scale independently of the number of shards in the cluster, enabling better performance and efficiency for large datasets.
+
+Shallow snapshot v2 must be enabled separately from shallow copies.
+
+### Enabling shallow snapshot v2
+
+To enable shallow snapshot v2, enable the following repository settings:
+
+- `remote_store_index_shallow_copy: true`
+- `shallow_snapshot_v2: true`
+
+The following example request creates a shallow snapshot v2 repository:
+
+```bash
+PUT /_snapshot/snap_repo
+{
+  "type": "s3",
+  "settings": {
+    "bucket": "test-bucket",
+    "base_path": "daily-snaps",
+    "remote_store_index_shallow_copy": true,
+    "shallow_snapshot_v2": true
+  }
+}
+```
+{% include copy-curl.html %}
+
+### Limitations
+
+Shallow snapshot v2 has the following limitations:
+
+* Shallow snapshot v2 is only supported for remote-backed indexes.
+* All nodes in the cluster must use OpenSearch 2.17 or later to take advantage of shallow snapshot v2.
diff --git a/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md b/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md
index b9e35b2697..d13955f3f0 100644
--- a/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md
+++ b/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md
@@ -18,7 +18,7 @@ The searchable snapshot feature incorporates techniques like caching frequently
To configure the searchable snapshots feature, create a node in your `opensearch.yml` file and define the node role as `search`. Optionally, you can also configure the `cache.size` property for the node.
-A `search` node reserves storage for the cache to perform searchable snapshot queries. In the case of a dedicated search node where the node exclusively has the `search` role, this value defaults to a fixed percentage of available storage. In other cases, the value needs to be configured by the user using the `node.search.cache.size` setting.
+A `search` node reserves storage for the cache to perform searchable snapshot queries. In the case of a dedicated search node where the node exclusively has the `search` role, this value defaults to a fixed percentage (80%) of available storage. In other cases, the value needs to be configured by the user using the `node.search.cache.size` setting.
Parameter | Type | Description
:--- | :--- | :---
diff --git a/_tuning-your-cluster/index.md b/_tuning-your-cluster/index.md
index 99db78565f..fa0973395f 100644
--- a/_tuning-your-cluster/index.md
+++ b/_tuning-your-cluster/index.md
@@ -192,11 +192,27 @@ To better understand and monitor your cluster, use the [CAT API]({{site.url}}{{s
## (Advanced) Step 6: Configure shard allocation awareness or forced awareness
+To further fine-tune your shard allocation, you can set custom node attributes for shard allocation awareness or forced awareness.
+
### Shard allocation awareness
-If your nodes are spread across several geographical zones, you can configure shard allocation awareness to allocate all replica shards to a zone that’s different from their primary shard.
+You can set custom node attributes on OpenSearch nodes to be used for shard allocation awareness. For example, you can set the `zone` attribute on each node to represent the zone in which the node is located. You can also use the `zone` attribute to ensure that the primary shard and its replica shards are allocated in a balanced manner across available, distinct zones. In this scenario, maximum shard copies per zone would equal `ceil (number_of_shard_copies/number_of_distinct_zones)`.
+
+OpenSearch, by default, allocates shard copies of a single shard across different nodes. When only 1 zone is available, such as after a zone failure, OpenSearch allocates replica shards to the only remaining zone---it considers only available zones (attribute values) when calculating the maximum number of allowed shard copies per zone.
+
+For example, if your index has a total of 5 shard copies (1 primary and 4 replicas) and nodes in 3 distinct zones, then OpenSearch will perform the following to allocate all 5 shard copies:
+
+- Allocate no more than 2 shards per zone, which will require at least 2 nodes in 2 zones.
+- Allocate the last shard in the third zone, with at least 1 node needed in the third zone.
-With shard allocation awareness, if the nodes in one of your zones fail, you can be assured that your replica shards are spread across your other zones. It adds a layer of fault tolerance to ensure your data survives a zone failure beyond just individual node failures.
+Alternatively, if you have 3 nodes in the first zone and 1 node in each remaining zone, then OpenSearch will allocate:
+
+- 2 shard copies in the first zone.
+- 1 shard copy in the remaining 2 zones.
+
+The final shard copy will remain unallocated due to the lack of nodes.
+
+With shard allocation awareness, if the nodes in one of your zones fail, you can be assured that your replica shards are spread across your other zones, adding a layer of fault tolerance to ensure that your data survives zone failures.
To configure shard allocation awareness, add zone attributes to `opensearch-d1` and `opensearch-d2`, respectively:
@@ -219,6 +235,8 @@ PUT _cluster/settings
}
```
+You can also use multiple attributes for shard allocation awareness by providing the attributes as a comma-separated string, for example, `zone,rack`.
+
You can either use `persistent` or `transient` settings. We recommend the `persistent` setting because it persists through a cluster reboot. Transient settings don't persist through a cluster reboot.
Shard allocation awareness attempts to separate primary and replica shards across multiple zones. However, if only one zone is available (such as after a zone failure), OpenSearch allocates replica shards to the only remaining zone.
diff --git a/_tuning-your-cluster/replication-plugin/auto-follow.md b/_tuning-your-cluster/replication-plugin/auto-follow.md
index 828b835387..92e7a6c144 100644
--- a/_tuning-your-cluster/replication-plugin/auto-follow.md
+++ b/_tuning-your-cluster/replication-plugin/auto-follow.md
@@ -98,9 +98,9 @@ To delete a replication rule, send the following request to the follower cluster
```bash
curl -XDELETE -k -H 'Content-Type: application/json' -u 'admin:' 'https://localhost:9200/_plugins/_replication/_autofollow?pretty' -d '
{
- "leader_alias" : "my-conection-alias",
+ "leader_alias" : "my-connection-alias",
"name": "my-replication-rule"
}'
```
-When you delete a replication rule, OpenSearch stops replicating *new* indexes that match the pattern, but existing indexes that the rule previously created remain read-only and continue to replicate. If you need to stop existing replication activity and open the indexes up for writes, use the [stop replication API operation]({{site.url}}{{site.baseurl}}/replication-plugin/api/#stop-replication).
\ No newline at end of file
+When you delete a replication rule, OpenSearch stops replicating *new* indexes that match the pattern, but existing indexes that the rule previously created remain read-only and continue to replicate. If you need to stop existing replication activity and open the indexes up for writes, use the [stop replication API operation]({{site.url}}{{site.baseurl}}/replication-plugin/api/#stop-replication).
diff --git a/assets/examples/ecommerce.json b/assets/examples/ecommerce.ndjson
similarity index 100%
rename from assets/examples/ecommerce.json
rename to assets/examples/ecommerce.ndjson
diff --git a/assets/js/search.js b/assets/js/search.js
index 8d9cab2ec5..86970d9544 100644
--- a/assets/js/search.js
+++ b/assets/js/search.js
@@ -173,7 +173,10 @@
const showNoResults = () => {
emptyResults();
- elResults.appendChild(document.createRange().createContextualFragment('No results found!'));
+ const resultElement = document.createElement('div');
+ resultElement.classList.add('search-page--results--no-results');
+ resultElement.appendChild(document.createRange().createContextualFragment('No results found.'));
+ elResults.appendChild(resultElement);
showResults();
elSpinner?.classList.remove(CLASSNAME_SPINNING);
};
@@ -278,8 +281,6 @@
window.doResultsPageSearch = async (query, type, version) => {
- console.log("Running results page search!");
-
const searchResultsContainer = document.getElementById('searchPageResultsContainer');
try {
@@ -291,7 +292,7 @@ window.doResultsPageSearch = async (query, type, version) => {
if (data.results && data.results.length > 0) {
data.results.forEach(result => {
const resultElement = document.createElement('div');
- resultElement.classList.add('search-page--results--diplay--container--item');
+ resultElement.classList.add('search-page--results--display--container--item');
const contentCite = document.createElement('cite');
const crumbs = [...result.ancestors];
@@ -302,11 +303,9 @@ window.doResultsPageSearch = async (query, type, version) => {
const titleLink = document.createElement('a');
titleLink.href = result.url;
+ titleLink.classList.add('search-page--results--display--container--item--link');
titleLink.textContent = result.title;
- titleLink.style.fontSize = '1.5em';
- titleLink.style.fontWeight = 'bold';
- titleLink.style.display = 'block';
-
+
const contentSpan = document.createElement('span');
contentSpan.textContent = result.content;
contentSpan.style.display = 'block';
@@ -317,16 +316,10 @@ window.doResultsPageSearch = async (query, type, version) => {
// Append the result element to the searchResultsContainer
searchResultsContainer.appendChild(resultElement);
-
- const breakline = document.createElement('hr');
- breakline.style.border = '.5px solid #ccc';
- breakline.style.margin = 'auto';
- searchResultsContainer.appendChild(breakline);
});
} else {
const noResultsElement = document.createElement('div');
noResultsElement.textContent = 'No results found.';
- noResultsElement.style.fontSize = '2em';
searchResultsContainer.appendChild(noResultsElement);
}
} catch (error) {
diff --git a/build.sh b/build.sh
index 060bbfa666..85ef617931 100755
--- a/build.sh
+++ b/build.sh
@@ -1,3 +1,9 @@
#!/usr/bin/env bash
-JEKYLL_LINK_CHECKER=internal bundle exec jekyll serve --host localhost --port 4000 --incremental --livereload --open-url --trace
+host="localhost"
+
+if [[ "$DOCKER_BUILD" == "true" ]]; then
+ host="0.0.0.0"
+fi
+
+JEKYLL_LINK_CHECKER=internal bundle exec jekyll serve --host ${host} --port 4000 --incremental --livereload --open-url --trace
diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml
new file mode 100644
index 0000000000..04dd007db9
--- /dev/null
+++ b/docker-compose.dev.yml
@@ -0,0 +1,14 @@
+version: "3"
+
+services:
+ doc_builder:
+ image: ruby:3.2.4
+ volumes:
+ - .:/app
+ working_dir: /app
+ ports:
+ - "4000:4000"
+ command: bash -c "bundler install && bash build.sh"
+ environment:
+ BUNDLE_PATH: /app/vendor/bundle # Avoid installing gems globally.
+ DOCKER_BUILD: true # Signify build.sh to bind to 0.0.0.0 for effective doc access from host.
diff --git a/release-notes/opensearch-documentation-release-notes-2.17.0.md b/release-notes/opensearch-documentation-release-notes-2.17.0.md
new file mode 100644
index 0000000000..d9ed51737c
--- /dev/null
+++ b/release-notes/opensearch-documentation-release-notes-2.17.0.md
@@ -0,0 +1,36 @@
+# OpenSearch Documentation Website 2.17.0 Release Notes
+
+The OpenSearch 2.17.0 documentation includes the following additions and updates.
+
+## New documentation for 2.17.0
+
+- Get offline batch inference details using task API in ml-commons [#8305](https://github.com/opensearch-project/documentation-website/pull/8305)
+- Documentation for Binary Quantization Support with KNN Vector Search [#8281](https://github.com/opensearch-project/documentation-website/pull/8281)
+- add offline batch ingestion tech doc [#8251](https://github.com/opensearch-project/documentation-website/pull/8251)
+- Add documentation changes for disk-based k-NN [#8246](https://github.com/opensearch-project/documentation-website/pull/8246)
+- Derived field updates for 2.17 [#8244](https://github.com/opensearch-project/documentation-website/pull/8244)
+- Add changes for multiple signing keys [#8243](https://github.com/opensearch-project/documentation-website/pull/8243)
+- Add documentation changes for Snapshot Status API [#8235](https://github.com/opensearch-project/documentation-website/pull/8235)
+- Update flow framework additional fields in previous_node_inputs [#8233](https://github.com/opensearch-project/documentation-website/pull/8233)
+- Add documentation changes for shallow snapshot v2 [#8207](https://github.com/opensearch-project/documentation-website/pull/8207)
+- Add documentation for context and ABC templates [#8197](https://github.com/opensearch-project/documentation-website/pull/8197)
+- Create documentation for snapshots with hashed prefix path type [#8196](https://github.com/opensearch-project/documentation-website/pull/8196)
+- Adding documentation for remote index use in AD [#8191](https://github.com/opensearch-project/documentation-website/pull/8191)
+- Doc update for concurrent search [#8181](https://github.com/opensearch-project/documentation-website/pull/8181)
+- Adding new cluster search setting docs [#8180](https://github.com/opensearch-project/documentation-website/pull/8180)
+- Add new settings for remote publication [#8176](https://github.com/opensearch-project/documentation-website/pull/8176)
+- Grouping Top N queries documentation [#8173](https://github.com/opensearch-project/documentation-website/pull/8173)
+- Document reprovision param for Update Workflow API [#8172](https://github.com/opensearch-project/documentation-website/pull/8172)
+- Add documentation for Faiss byte vector [#8170](https://github.com/opensearch-project/documentation-website/pull/8170)
+- Terms query can accept encoded terms input as bitmap [#8133](https://github.com/opensearch-project/documentation-website/pull/8133)
+- Update doc for adding new param in cat shards action for cancellation… [#8127](https://github.com/opensearch-project/documentation-website/pull/8127)
+- Add docs on skip_validating_missing_parameters in ml-commons connector [#8118](https://github.com/opensearch-project/documentation-website/pull/8118)
+- Add Split Response Processor to 2.17 Search Pipeline docs [#8081](https://github.com/opensearch-project/documentation-website/pull/8081)
+- Added documentation for FGAC for Flow Framework [#8076](https://github.com/opensearch-project/documentation-website/pull/8076)
+- Remove composite agg limitations for concurrent search [#7904](https://github.com/opensearch-project/documentation-website/pull/7904)
+- Add doc for nodes stats search.request.took fields [#7887](https://github.com/opensearch-project/documentation-website/pull/7887)
+- Add documentation for ignore_hosts config option for ip-based rate limiting [#7859](https://github.com/opensearch-project/documentation-website/pull/7859)
+
+## Documentation for 2.17.0 experimental features
+
+- Document new experimental ingestion streaming APIs [#8123](https://github.com/opensearch-project/documentation-website/pull/8123)
updateAllCheckbox();
triggerSearch(searchInput.value.trim());
});
- categoryNews.addEventListener('change', () => {
+ categoryBlog.addEventListener('change', () => {
+ updateAllCheckbox();
+ triggerSearch(searchInput.value.trim());
+ });
+ categoryEvent.addEventListener('change', () => {
updateAllCheckbox();
triggerSearch(searchInput.value.trim());
});
diff --git a/_ml-commons-plugin/api/async-batch-ingest.md b/_ml-commons-plugin/api/async-batch-ingest.md
new file mode 100644
index 0000000000..ace95ba4d4
--- /dev/null
+++ b/_ml-commons-plugin/api/async-batch-ingest.md
@@ -0,0 +1,97 @@
+---
+layout: default
+title: Asynchronous batch ingestion
+parent: ML Commons APIs
+has_children: false
+has_toc: false
+nav_order: 35
+---
+
+# Asynchronous batch ingestion
+**Introduced 2.17**
+{: .label .label-purple }
+
+Use the Asynchronous Batch Ingestion API to ingest data into your OpenSearch cluster from your files on remote file servers, such as Amazon Simple Storage Service (Amazon S3) or OpenAI. For detailed configuration steps, see [Asynchronous batch ingestion]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/async-batch-ingestion/).
+
+## Path and HTTP methods
+
+```json
+POST /_plugins/_ml/_batch_ingestion
+```
+
+#### Request fields
+
+The following table lists the available request fields.
+
+Field | Data type | Required/Optional | Description
+:--- | :--- | :--- | :---
+`index_name`| String | Required | The index name.
+`field_map` | Object | Required | Maps fields from the source file to specific fields in an OpenSearch index for ingestion.
+`ingest_fields` | Array | Optional | Lists fields from the source file that should be ingested directly into the OpenSearch index without any additional mapping.
+`credential` | Object | Required | Contains the authentication information for accessing external data sources, such as Amazon S3 or OpenAI.
+`data_source` | Object | Required | Specifies the type and location of the external file(s) from which the data is ingested.
+`data_source.type` | String | Required | Specifies the type of the external data source. Valid values are `s3` and `openAI`.
+`data_source.source` | Array | Required | Specifies one or more file locations from which the data is ingested. For `s3`, specify the file path to the Amazon S3 bucket (for example, `["s3://offlinebatch/output/sagemaker_batch.json.out"]`). For `openAI`, specify the file IDs for input or output files (for example, `["file-", "file-", "file-"]`).
+
+## Example request: Ingesting a single file
+
+```json
+POST /_plugins/_ml/_batch_ingestion
+{
+ "index_name": "my-nlp-index",
+ "field_map": {
+ "chapter": "$.content[0]",
+ "title": "$.content[1]",
+ "chapter_embedding": "$.SageMakerOutput[0]",
+ "title_embedding": "$.SageMakerOutput[1]",
+ "_id": "$.id"
+ },
+ "ingest_fields": ["$.id"],
+ "credential": {
+ "region": "us-east-1",
+ "access_key": "",
+ "secret_key": "",
+ "session_token": ""
+ },
+ "data_source": {
+ "type": "s3",
+ "source": ["s3://offlinebatch/output/sagemaker_batch.json.out"]
+ }
+}
+```
+{% include copy-curl.html %}
+
+## Example request: Ingesting multiple files
+
+```json
+POST /_plugins/_ml/_batch_ingestion
+{
+ "index_name": "my-nlp-index-openai",
+ "field_map": {
+ "question": "source[1].$.body.input[0]",
+ "answer": "source[1].$.body.input[1]",
+ "question_embedding":"source[0].$.response.body.data[0].embedding",
+ "answer_embedding":"source[0].$.response.body.data[1].embedding",
+ "_id": ["source[0].$.custom_id", "source[1].$.custom_id"]
+ },
+ "ingest_fields": ["source[2].$.custom_field1", "source[2].$.custom_field2"],
+ "credential": {
+ "openAI_key": ""
+ },
+ "data_source": {
+ "type": "openAI",
+ "source": ["file-", "file-", "file-"]
+ }
+}
+```
+{% include copy-curl.html %}
+
+## Example response
+
+```json
+{
+ "task_id": "cbsPlpEBMHcagzGbOQOx",
+ "task_type": "BATCH_INGEST",
+ "status": "CREATED"
+}
+```
diff --git a/_ml-commons-plugin/api/connector-apis/update-connector.md b/_ml-commons-plugin/api/connector-apis/update-connector.md
index 64790bb57f..625d58bb62 100644
--- a/_ml-commons-plugin/api/connector-apis/update-connector.md
+++ b/_ml-commons-plugin/api/connector-apis/update-connector.md
@@ -29,17 +29,20 @@ PUT /_plugins/_ml/connectors/
The following table lists the updatable fields. For more information about all connector fields, see [Blueprint configuration parameters]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints#configuration-parameters).
-| Field | Data type | Description |
-| :--- | :--- | :--- |
-| `name` | String | The name of the connector. |
-| `description` | String | A description of the connector. |
-| `version` | Integer | The version of the connector. |
-| `protocol` | String | The protocol for the connection. For AWS services, such as Amazon SageMaker and Amazon Bedrock, use `aws_sigv4`. For all other services, use `http`. |
-| `parameters` | JSON object | The default connector parameters, including `endpoint` and `model`. Any parameters included in this field can be overridden by parameters specified in a predict request. |
+| Field | Data type | Description |
+| :--- |:------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `name` | String | The name of the connector. |
+| `description` | String | A description of the connector. |
+| `version` | Integer | The connector version. |
+| `protocol` | String | The protocol for the connection. For AWS services, such as Amazon SageMaker and Amazon Bedrock, use `aws_sigv4`. For all other services, use `http`. |
+| `parameters` | JSON object | The default connector parameters, including `endpoint` and `model`. Any parameters included in this field can be overridden by parameters specified in a predict request. |
| `credential` | JSON object | Defines any credential variables required in order to connect to your chosen endpoint. ML Commons uses **AES/GCM/NoPadding** symmetric encryption to encrypt your credentials. When the connection to the cluster first starts, OpenSearch creates a random 32-byte encryption key that persists in OpenSearch's system index. Therefore, you do not need to manually set the encryption key. |
-| `actions` | JSON array | Defines which actions can run within the connector. If you're an administrator creating a connection, add the [blueprint]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/) for your desired connection. |
-| `backend_roles` | JSON array | A list of OpenSearch backend roles. For more information about setting up backend roles, see [Assigning backend roles to users]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#assigning-backend-roles-to-users). |
-| `access_mode` | String | Sets the access mode for the model, either `public`, `restricted`, or `private`. Default is `private`. For more information about `access_mode`, see [Model groups]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#model-groups). |
+| `actions` | JSON array | Defines which actions can run within the connector. If you're an administrator creating a connection, add the [blueprint]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/) for your desired connection. |
+| `backend_roles` | JSON array | A list of OpenSearch backend roles. For more information about setting up backend roles, see [Assigning backend roles to users]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#assigning-backend-roles-to-users). |
+| `access_mode` | String | Sets the access mode for the model, either `public`, `restricted`, or `private`. Default is `private`. For more information about `access_mode`, see [Model groups]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#model-groups). |
+| `parameters.skip_validating_missing_parameters` | Boolean | When set to `true`, this option allows you to send a request using a connector without validating any missing parameters. Default is `false`. |
+
+
#### Example request
diff --git a/_ml-commons-plugin/api/execute-algorithm.md b/_ml-commons-plugin/api/execute-algorithm.md
index 7b06cfefe8..6acd926444 100644
--- a/_ml-commons-plugin/api/execute-algorithm.md
+++ b/_ml-commons-plugin/api/execute-algorithm.md
@@ -2,7 +2,7 @@
layout: default
title: Execute algorithm
parent: ML Commons APIs
-nav_order: 30
+nav_order: 37
---
# Execute algorithm
diff --git a/_ml-commons-plugin/api/model-apis/batch-predict.md b/_ml-commons-plugin/api/model-apis/batch-predict.md
index b32fbb108d..c1dc7348fe 100644
--- a/_ml-commons-plugin/api/model-apis/batch-predict.md
+++ b/_ml-commons-plugin/api/model-apis/batch-predict.md
@@ -31,7 +31,13 @@ POST /_plugins/_ml/models//_batch_predict
## Prerequisites
-Before using the Batch Predict API, you need to create a connector to the externally hosted model. For example, to create a connector to an OpenAI `text-embedding-ada-002` model, send the following request:
+Before using the Batch Predict API, you need to create a connector to the externally hosted model. For each action, specify the `action_type` parameter that describes the action:
+
+- `batch_predict`: Runs the batch predict operation.
+- `batch_predict_status`: Checks the batch predict operation status.
+- `cancel_batch_predict`: Cancels the batch predict operation.
+
+For example, to create a connector to an OpenAI `text-embedding-ada-002` model, send the following request. The `cancel_batch_predict` action is optional and supports canceling the batch job running on OpenAI:
```json
POST /_plugins/_ml/connectors/_create
@@ -68,6 +74,22 @@ POST /_plugins/_ml/connectors/_create
"Authorization": "Bearer ${credential.openAI_key}"
},
"request_body": "{ \"input_file_id\": \"${parameters.input_file_id}\", \"endpoint\": \"${parameters.endpoint}\", \"completion_window\": \"24h\" }"
+ },
+ {
+ "action_type": "batch_predict_status",
+ "method": "GET",
+ "url": "https://api.openai.com/v1/batches/${parameters.id}",
+ "headers": {
+ "Authorization": "Bearer ${credential.openAI_key}"
+ }
+ },
+ {
+ "action_type": "cancel_batch_predict",
+ "method": "POST",
+ "url": "https://api.openai.com/v1/batches/${parameters.id}/cancel",
+ "headers": {
+ "Authorization": "Bearer ${credential.openAI_key}"
+ }
}
]
}
@@ -123,45 +145,87 @@ POST /_plugins/_ml/models/lyjxwZABNrAVdFa9zrcZ/_batch_predict
#### Example response
+The response contains the task ID for the batch predict operation:
+
```json
{
- "inference_results": [
- {
- "output": [
- {
- "name": "response",
- "dataAsMap": {
- "id": "batch_",
- "object": "batch",
- "endpoint": "/v1/embeddings",
- "errors": null,
- "input_file_id": "file-",
- "completion_window": "24h",
- "status": "validating",
- "output_file_id": null,
- "error_file_id": null,
- "created_at": 1722037257,
- "in_progress_at": null,
- "expires_at": 1722123657,
- "finalizing_at": null,
- "completed_at": null,
- "failed_at": null,
- "expired_at": null,
- "cancelling_at": null,
- "cancelled_at": null,
- "request_counts": {
- "total": 0,
- "completed": 0,
- "failed": 0
- },
- "metadata": null
- }
- }
- ],
- "status_code": 200
- }
- ]
+ "task_id": "KYZSv5EBqL2d0mFvs80C",
+ "status": "CREATED"
}
```
-For the definition of each field in the result, see [OpenAI Batch API](https://platform.openai.com/docs/guides/batch). Once the batch inference is complete, you can download the output by calling the [OpenAI Files API](https://platform.openai.com/docs/api-reference/files) and providing the file name specified in the `id` field of the response.
\ No newline at end of file
+To check the status of the batch predict job, provide the task ID to the [Tasks API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/get-task/). You can find the job details in the `remote_job` field in the task. Once the prediction is complete, the task `state` changes to `COMPLETED`.
+
+#### Example request
+
+```json
+GET /_plugins/_ml/tasks/KYZSv5EBqL2d0mFvs80C
+```
+{% include copy-curl.html %}
+
+#### Example response
+
+The response contains the batch predict operation details in the `remote_job` field:
+
+```json
+{
+ "model_id": "JYZRv5EBqL2d0mFvKs1E",
+ "task_type": "BATCH_PREDICTION",
+ "function_name": "REMOTE",
+ "state": "RUNNING",
+ "input_type": "REMOTE",
+ "worker_node": [
+ "Ee5OCIq0RAy05hqQsNI1rg"
+ ],
+ "create_time": 1725491751455,
+ "last_update_time": 1725491751455,
+ "is_async": false,
+ "remote_job": {
+ "cancelled_at": null,
+ "metadata": null,
+ "request_counts": {
+ "total": 3,
+ "completed": 3,
+ "failed": 0
+ },
+ "input_file_id": "file-XXXXXXXXXXXX",
+ "output_file_id": "file-XXXXXXXXXXXXX",
+ "error_file_id": null,
+ "created_at": 1725491753,
+ "in_progress_at": 1725491753,
+ "expired_at": null,
+ "finalizing_at": 1725491757,
+ "completed_at": null,
+ "endpoint": "/v1/embeddings",
+ "expires_at": 1725578153,
+ "cancelling_at": null,
+ "completion_window": "24h",
+ "id": "batch_XXXXXXXXXXXXXXX",
+ "failed_at": null,
+ "errors": null,
+ "object": "batch",
+ "status": "in_progress"
+ }
+}
+```
+
+For the definition of each field in the result, see [OpenAI Batch API](https://platform.openai.com/docs/guides/batch). Once the batch inference is complete, you can download the output by calling the [OpenAI Files API](https://platform.openai.com/docs/api-reference/files) and providing the file name specified in the `id` field of the response.
+
+### Canceling a batch predict job
+
+You can also cancel the batch predict operation running on the remote platform using the task ID returned by the batch predict request. To add this capability, set the `action_type` to `cancel_batch_predict` in the connector configuration when creating the connector.
+
+#### Example request
+
+```json
+POST /_plugins/_ml/tasks/KYZSv5EBqL2d0mFvs80C/_cancel_batch
+```
+{% include copy-curl.html %}
+
+#### Example response
+
+```json
+{
+ "status": "OK"
+}
+```
diff --git a/_ml-commons-plugin/remote-models/async-batch-ingestion.md b/_ml-commons-plugin/remote-models/async-batch-ingestion.md
new file mode 100644
index 0000000000..a09c028477
--- /dev/null
+++ b/_ml-commons-plugin/remote-models/async-batch-ingestion.md
@@ -0,0 +1,190 @@
+---
+layout: default
+title: Asynchronous batch ingestion
+nav_order: 90
+parent: Connecting to externally hosted models
+grand_parent: Integrating ML models
+---
+
+
+# Asynchronous batch ingestion
+**Introduced 2.17**
+{: .label .label-purple }
+
+[Batch ingestion]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/batch-ingestion/) configures an ingest pipeline, which processes documents one by one. For each document, batch ingestion calls an externally hosted model to generate text embeddings from the document text and then ingests the document, including text and embeddings, into an OpenSearch index.
+
+An alternative to this real-time process, _asynchronous_ batch ingestion, ingests both documents and their embeddings generated outside of OpenSearch and stored on a remote file server, such as Amazon Simple Storage Service (Amazon S3) or OpenAI. Asynchronous ingestion returns a task ID and runs asynchronously to ingest data offline into your k-NN cluster for neural search. You can use asynchronous batch ingestion together with the [Batch Predict API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/model-apis/batch-predict/) to perform inference asynchronously. The batch predict operation takes an input file containing documents and calls an externally hosted model to generate embeddings for those documents in an output file. You can then use asynchronous batch ingestion to ingest both the input file containing documents and the output file containing their embeddings into an OpenSearch index.
+
+As of OpenSearch 2.17, the Asynchronous Batch Ingestion API is supported by Amazon SageMaker, Amazon Bedrock, and OpenAI.
+{: .note}
+
+## Prerequisites
+
+Before using asynchronous batch ingestion, you must generate text embeddings using a model of your choice and store the output on a file server, such as Amazon S3. For example, you can store the output of a Batch API call to an Amazon SageMaker text embedding model in a file with the Amazon S3 output path `s3://offlinebatch/output/sagemaker_batch.json.out`. The output is in JSONL format, with each line representing a text embedding result. The file contents have the following format:
+
+```
+{"SageMakerOutput":[[-0.017166402,0.055771016,...],[-0.06422759,-0.004301484,...]],"content":["this is chapter 1","harry potter"],"id":1}
+{"SageMakerOutput":[[-0.017455402,0.023771016,...],[-0.02322759,-0.009101284,...]],"content":["this is chapter 2","draco malfoy"],"id":1}
+...
+```
+
+## Ingesting data from a single file
+
+First, create a k-NN index into which you'll ingest the data. The fields in the k-NN index represent the structure of the data in the source file.
+
+In this example, the source file holds documents containing titles and chapters, along with their corresponding embeddings. Thus, you'll create a k-NN index with the fields `id`, `chapter_embedding`, `chapter`, `title_embedding`, and `title`:
+
+```json
+PUT /my-nlp-index
+{
+ "settings": {
+ "index.knn": true
+ },
+ "mappings": {
+ "properties": {
+ "id": {
+ "type": "text"
+ },
+ "chapter_embedding": {
+ "type": "knn_vector",
+ "dimension": 384,
+ "method": {
+ "engine": "nmslib",
+ "space_type": "cosinesimil",
+ "name": "hnsw",
+ "parameters": {
+ "ef_construction": 512,
+ "m": 16
+ }
+ }
+ },
+ "chapter": {
+ "type": "text"
+ },
+ "title_embedding": {
+ "type": "knn_vector",
+ "dimension": 384,
+ "method": {
+ "engine": "nmslib",
+ "space_type": "cosinesimil",
+ "name": "hnsw",
+ "parameters": {
+ "ef_construction": 512,
+ "m": 16
+ }
+ }
+ },
+ "title": {
+ "type": "text"
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+When using an S3 file as the source for asynchronous batch ingestion, you must map the fields in the source file to fields in the index in order to indicate into which index each piece of data is ingested. If no JSON path is provided for a field, that field will be set to `null` in the k-NN index.
+
+In the `field_map`, indicate the location of the data for each field in the source file. You can also specify fields to be ingested directly into your index without making any changes to the source file by adding their JSON paths to the `ingest_fields` array. For example, in the following asynchronous batch ingestion request, the element with the JSON path `$.id` from the source file is ingested directly into the `id` field of your index. To ingest this data from the Amazon S3 file, send the following request to your OpenSearch endpoint:
+
+```json
+POST /_plugins/_ml/_batch_ingestion
+{
+ "index_name": "my-nlp-index",
+ "field_map": {
+ "chapter": "$.content[0]",
+ "title": "$.content[1]",
+ "chapter_embedding": "$.SageMakerOutput[0]",
+ "title_embedding": "$.SageMakerOutput[1]",
+ "_id": "$.id"
+ },
+ "ingest_fields": ["$.id"],
+ "credential": {
+ "region": "us-east-1",
+ "access_key": "",
+ "secret_key": "",
+ "session_token": ""
+ },
+ "data_source": {
+ "type": "s3",
+ "source": ["s3://offlinebatch/output/sagemaker_batch.json.out"]
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response contains a task ID for the ingestion task:
+
+```json
+{
+ "task_id": "cbsPlpEBMHcagzGbOQOx",
+ "task_type": "BATCH_INGEST",
+ "status": "CREATED"
+}
+```
+
+To check the status of the operation, provide the task ID to the [Tasks API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/get-task/). Once ingestion is complete, the task `state` changes to `COMPLETED`.
+
+
+## Ingesting data from multiple files
+
+You can also ingest data from multiple files by specifying the file locations in the `source`. The following example ingests data from three OpenAI files.
+
+The OpenAI Batch API input file is formatted as follows:
+
+```
+{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "text-embedding-ada-002", "input": [ "What is the meaning of life?", "The food was delicious and the waiter..."]}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "text-embedding-ada-002", "input": [ "What is the meaning of work?", "The travel was fantastic and the view..."]}}
+{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "text-embedding-ada-002", "input": [ "What is the meaning of friend?", "The old friend was far away and the time..."]}}
+...
+```
+
+The OpenAI Batch API output file is formatted as follows:
+
+```
+{"id": "batch_req_ITKQn29igorXCAGp6wzYs5IS", "custom_id": "request-1", "response": {"status_code": 200, "request_id": "10845755592510080d13054c3776aef4", "body": {"object": "list", "data": [{"object": "embedding", "index": 0, "embedding": [0.0044326545, ... ...]}, {"object": "embedding", "index": 1, "embedding": [0.002297497, ... ... ]}], "model": "text-embedding-ada-002", "usage": {"prompt_tokens": 15, "total_tokens": 15}}}, "error": null}
+...
+```
+
+If you have run the Batch API in OpenAI for text embedding and want to ingest the model input and output files along with some metadata into your index, send the following asynchronous ingestion request. Make sure to use `source[file-index]` to identify the file's location in the source array in the request body. For example, `source[0]` refers to the first file in the `data_source.source` array.
+
+The following request ingests seven fields into your index: Five are specified in the `field_map` section and two are specified in `ingest_fields`. The format follows the pattern `sourcefile.jsonPath`, indicating the JSON path for each file. In the `field_map`, `$.body.input[0]` is used as the JSON path to ingest data into the `question` field from the second file in the `source` array. The `ingest_fields` array lists all elements from the `source` files that will be ingested directly into your index:
+
+```json
+POST /_plugins/_ml/_batch_ingestion
+{
+ "index_name": "my-nlp-index-openai",
+ "field_map": {
+ "question": "source[1].$.body.input[0]",
+ "answer": "source[1].$.body.input[1]",
+ "question_embedding":"source[0].$.response.body.data[0].embedding",
+ "answer_embedding":"source[0].$.response.body.data[1].embedding",
+ "_id": ["source[0].$.custom_id", "source[1].$.custom_id"]
+ },
+ "ingest_fields": ["source[2].$.custom_field1", "source[2].$.custom_field2"],
+ "credential": {
+ "openAI_key": ""
+ },
+ "data_source": {
+ "type": "openAI",
+ "source": ["file-", "file-", "file-"]
+ }
+}
+```
+{% include copy-curl.html %}
+
+In the request, make sure to define the `_id` field in the `field_map`. This is necessary in order to map each data entry from the three separate files.
+
+The response contains a task ID for the ingestion task:
+
+```json
+{
+ "task_id": "cbsPlpEBMHcagzGbOQOx",
+ "task_type": "BATCH_INGEST",
+ "status": "CREATED"
+}
+```
+
+To check the status of the operation, provide the task ID to the [Tasks API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/tasks-apis/get-task/). Once ingestion is complete, the task `state` changes to `COMPLETED`.
+
+For request field descriptions, see [Asynchronous Batch Ingestion API]({{site.url}}{{site.baseurl}}/ml-commons-plugin/api/async-batch-ingest/).
\ No newline at end of file
diff --git a/_ml-commons-plugin/remote-models/blueprints.md b/_ml-commons-plugin/remote-models/blueprints.md
index 254a21b068..9b95c31166 100644
--- a/_ml-commons-plugin/remote-models/blueprints.md
+++ b/_ml-commons-plugin/remote-models/blueprints.md
@@ -55,19 +55,20 @@ As an ML developer, you can build connector blueprints for other platforms. Usin
## Configuration parameters
-| Field | Data type | Is required | Description |
-|:---|:---|:---|:---|
-| `name` | String | Yes | The name of the connector. |
-| `description` | String | Yes | A description of the connector. |
-| `version` | Integer | Yes | The version of the connector. |
-| `protocol` | String | Yes | The protocol for the connection. For AWS services such as Amazon SageMaker and Amazon Bedrock, use `aws_sigv4`. For all other services, use `http`. |
-| `parameters` | JSON object | Yes | The default connector parameters, including `endpoint` and `model`. Any parameters indicated in this field can be overridden by parameters specified in a predict request. |
-| `credential` | JSON object | Yes | Defines any credential variables required to connect to your chosen endpoint. ML Commons uses **AES/GCM/NoPadding** symmetric encryption to encrypt your credentials. When the connection to the cluster first starts, OpenSearch creates a random 32-byte encryption key that persists in OpenSearch's system index. Therefore, you do not need to manually set the encryption key. |
-| `actions` | JSON array | Yes | Defines what actions can run within the connector. If you're an administrator creating a connection, add the [blueprint]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/) for your desired connection. |
-| `backend_roles` | JSON array | Yes | A list of OpenSearch backend roles. For more information about setting up backend roles, see [Assigning backend roles to users]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#assigning-backend-roles-to-users). |
-| `access_mode` | String | Yes | Sets the access mode for the model, either `public`, `restricted`, or `private`. Default is `private`. For more information about `access_mode`, see [Model groups]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#model-groups). |
-| `add_all_backend_roles` | Boolean | Yes | When set to `true`, adds all `backend_roles` to the access list, which only a user with admin permissions can adjust. When set to `false`, non-admins can add `backend_roles`. |
-| `client_config` | JSON object | No | The client configuration object, which provides settings that control the behavior of the client connections used by the connector. These settings allow you to manage connection limits and timeouts, ensuring efficient and reliable communication. |
+| Field | Data type | Is required | Description |
+|:-------------------------------------------------|:---|:------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `name` | String | Yes | The name of the connector. |
+| `description` | String | Yes | A description of the connector. |
+| `version` | Integer | Yes | The connector version. |
+| `protocol` | String | Yes | The protocol for the connection. For AWS services, such as Amazon SageMaker and Amazon Bedrock, use `aws_sigv4`. For all other services, use `http`. |
+| `parameters` | JSON object | Yes | The default connector parameters, including `endpoint`, `model`, and `skip_validating_missing_parameters`. Any parameters indicated in this field can be overridden by parameters specified in a predict request. |
+| `credential` | JSON object | Yes | Defines any credential variables required for connecting to your chosen endpoint. ML Commons uses **AES/GCM/NoPadding** symmetric encryption to encrypt your credentials. When the cluster connection is initiated, OpenSearch creates a random 32-byte encryption key that persists in OpenSearch's system index. Therefore, you do not need to manually set the encryption key. |
+| `actions` | JSON array | Yes | Defines the actions that can run within the connector. If you're an administrator creating a connection, add the [blueprint]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/blueprints/) for your desired connection. |
+| `backend_roles` | JSON array | Yes | A list of OpenSearch backend roles. For more information about setting up backend roles, see [Assigning backend roles to users]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#assigning-backend-roles-to-users). |
+| `access_mode` | String | Yes | Sets the access mode for the model, either `public`, `restricted`, or `private`. Default is `private`. For more information about `access_mode`, see [Model groups]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-access-control#model-groups). |
+| `add_all_backend_roles` | Boolean | Yes | When set to `true`, adds all `backend_roles` to the access list, which only a user with admin permissions can adjust. When set to `false`, non-admins can add `backend_roles`. |
+| `client_config` | JSON object | No | The client configuration object, which provides settings that control the behavior of the client connections used by the connector. These settings allow you to manage connection limits and timeouts, ensuring efficient and reliable communication. |
+| `parameters.skip_validating_missing_parameters` | Boolean | No | When set to `true`, this option allows you to send a request using a connector without validating any missing parameters. Default is `false`. |
The `actions` parameter supports the following options.
@@ -76,12 +77,11 @@ The `actions` parameter supports the following options.
|:---|:---|:---|
| `action_type` | String | Required. Sets the ML Commons API operation to use upon connection. As of OpenSearch 2.9, only `predict` is supported. |
| `method` | String | Required. Defines the HTTP method for the API call. Supports `POST` and `GET`. |
-| `url` | String | Required. Sets the connection endpoint at which the action occurs. This must match the regex expression for the connection used when [adding trusted endpoints]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index#adding-trusted-endpoints). |
-| `headers` | JSON object | Sets the headers used inside the request or response body. Default is `ContentType: application/json`. If your third-party ML tool requires access control, define the required `credential` parameters in the `headers` parameter. |
+| `url` | String | Required. Specifies the connection endpoint at which the action occurs. This must match the regex expression for the connection used when [adding trusted endpoints]({{site.url}}{{site.baseurl}}/ml-commons-plugin/remote-models/index#adding-trusted-endpoints).|
| `request_body` | String | Required. Sets the parameters contained in the request body of the action. The parameters must include `\"inputText\`, which specifies how users of the connector should construct the request payload for the `action_type`. |
| `pre_process_function` | String | Optional. A built-in or custom Painless script used to preprocess the input data. OpenSearch provides the following built-in preprocess functions that you can call directly:
- `connector.pre_process.cohere.embedding` for [Cohere](https://cohere.com/) embedding models
- `connector.pre_process.openai.embedding` for [OpenAI](https://platform.openai.com/docs/guides/embeddings) embedding models
- `connector.pre_process.default.embedding`, which you can use to preprocess documents in neural search requests so that they are in the format that ML Commons can process with the default preprocessor (OpenSearch 2.11 or later). For more information, see [Built-in functions](#built-in-pre--and-post-processing-functions). |
| `post_process_function` | String | Optional. A built-in or custom Painless script used to post-process the model output data. OpenSearch provides the following built-in post-process functions that you can call directly:
- `connector.pre_process.cohere.embedding` for [Cohere text embedding models](https://docs.cohere.com/reference/embed)
- `connector.pre_process.openai.embedding` for [OpenAI text embedding models](https://platform.openai.com/docs/api-reference/embeddings)
- `connector.post_process.default.embedding`, which you can use to post-process documents in the model response so that they are in the format that neural search expects (OpenSearch 2.11 or later). For more information, see [Built-in functions](#built-in-pre--and-post-processing-functions). |
-
+| `headers` | JSON object | Specifies the headers used in the request or response body. Default is `ContentType: application/json`. If your third-party ML tool requires access control, define the required `credential` parameters in the `headers` parameter. |
The `client_config` parameter supports the following options.
diff --git a/_ml-commons-plugin/tutorials/semantic-search-byte-vectors.md b/_ml-commons-plugin/tutorials/semantic-search-byte-vectors.md
index 7061d3cb5a..c4cc27f660 100644
--- a/_ml-commons-plugin/tutorials/semantic-search-byte-vectors.md
+++ b/_ml-commons-plugin/tutorials/semantic-search-byte-vectors.md
@@ -7,7 +7,7 @@ nav_order: 10
# Semantic search using byte-quantized vectors
-This tutorial illustrates how to build a semantic search using the [Cohere Embed model](https://docs.cohere.com/reference/embed) and byte-quantized vectors. For more information about using byte-quantized vectors, see [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#lucene-byte-vector).
+This tutorial shows you how to build a semantic search using the [Cohere Embed model](https://docs.cohere.com/reference/embed) and byte-quantized vectors. For more information about using byte-quantized vectors, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#byte-vectors).
The Cohere Embed v3 model supports several `embedding_types`. For this tutorial, you'll use the `INT8` type to encode byte-quantized vectors.
diff --git a/_monitoring-your-cluster/pa/index.md b/_monitoring-your-cluster/pa/index.md
index bb4f9c6c30..156e985e8b 100644
--- a/_monitoring-your-cluster/pa/index.md
+++ b/_monitoring-your-cluster/pa/index.md
@@ -60,7 +60,7 @@ private-key-file-path = specify_path
The Performance Analyzer plugin is included in the installations for [Docker]({{site.url}}{{site.baseurl}}/opensearch/install/docker/) and [tarball]({{site.url}}{{site.baseurl}}/opensearch/install/tar/), but you can also install the plugin manually.
-To install the Performance Analyzer plugin manually, download the plugin from [Maven](https://search.maven.org/search?q=org.opensearch.plugin) and install it using the standard [plugin installation]({{site.url}}{{site.baseurl}}/opensearch/install/plugins/) process. Performance Analyzer runs on each node in a cluster.
+To install the Performance Analyzer plugin manually, download the plugin from [Maven](https://central.sonatype.com/namespace/org.opensearch.plugin) and install it using the standard [plugin installation]({{site.url}}{{site.baseurl}}/opensearch/install/plugins/) process. Performance Analyzer runs on each node in a cluster.
To start the Performance Analyzer root cause analysis (RCA) agent on a tarball installation, run the following command:
diff --git a/_observing-your-data/ad/dashboards-anomaly-detection.md b/_observing-your-data/ad/dashboards-anomaly-detection.md
index 679237094a..ad6fa5950b 100644
--- a/_observing-your-data/ad/dashboards-anomaly-detection.md
+++ b/_observing-your-data/ad/dashboards-anomaly-detection.md
@@ -18,12 +18,12 @@ You can connect data visualizations to OpenSearch datasets and then create, run,
Before getting started, you must have:
- Installed OpenSearch and OpenSearch Dashboards version 2.9 or later. See [Installing OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/index/).
-- Installed the Anomaly Detection plugin version 2.9 or later. See [Installing OpenSearch plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins).
+- Installed the Anomaly Detection plugin version 2.9 or later. See [Installing OpenSearch plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/).
- Installed the Anomaly Detection Dashboards plugin version 2.9 or later. See [Managing OpenSearch Dashboards plugins]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/plugins/) to get started.
## General requirements for anomaly detection visualizations
-Anomaly detection visualizations are displayed as time-series charts that give you a snapshot of when anomalies have occurred from different anomaly detectors you have configured for the visualization. You can display up to 10 metrics on your chart, and each series can be shown as a line on the chart. Note that only real-time anomalies will be visible on the chart. For more information on real-time and historical anomaly detection, see [Anomaly detection, Step 3: Set up detector jobs]({{site.url}}{{site.baseurl}}/observing-your-data/ad/index/#step-3-set-up-detector-jobs).
+Anomaly detection visualizations are displayed as time-series charts that give you a snapshot of when anomalies have occurred from different anomaly detectors you have configured for the visualization. You can display up to 10 metrics on your chart, and each series can be shown as a line on the chart. Note that only real-time anomalies will be visible on the chart. For more information about real-time and historical anomaly detection, see [Anomaly detection, Step 3: Set up detector jobs]({{site.url}}{{site.baseurl}}/observing-your-data/ad/index/#step-3-setting-up-detector-jobs).
Keep in mind the following requirements when setting up or creating anomaly detection visualizations. The visualization:
diff --git a/_observing-your-data/ad/index.md b/_observing-your-data/ad/index.md
index 5dfa1b8f1a..657c3c90cb 100644
--- a/_observing-your-data/ad/index.md
+++ b/_observing-your-data/ad/index.md
@@ -10,30 +10,42 @@ redirect_from:
# Anomaly detection
-An anomaly in OpenSearch is any unusual behavior change in your time-series data. Anomalies can provide valuable insights into your data. For example, for IT infrastructure data, an anomaly in the memory usage metric might help you uncover early signs of a system failure.
+An _anomaly_ in OpenSearch is any unusual behavior change in your time-series data. Anomalies can provide valuable insights into your data. For example, for IT infrastructure data, an anomaly in the memory usage metric can help identify early signs of a system failure.
-It can be challenging to discover anomalies using conventional methods such as creating visualizations and dashboards. You could configure an alert based on a static threshold, but this requires prior domain knowledge and isn't adaptive to data that exhibits organic growth or seasonal behavior.
+Conventional techniques like visualizations and dashboards can make it difficult to uncover anomalies. Configuring alerts based on static thresholds is possible, but this approach requires prior domain knowledge and may not adapt to data with organic growth or seasonal trends.
-Anomaly detection automatically detects anomalies in your OpenSearch data in near real-time using the Random Cut Forest (RCF) algorithm. RCF is an unsupervised machine learning algorithm that models a sketch of your incoming data stream to compute an `anomaly grade` and `confidence score` value for each incoming data point. These values are used to differentiate an anomaly from normal variations. For more information about how RCF works, see [Random Cut Forests](https://www.semanticscholar.org/paper/Robust-Random-Cut-Forest-Based-Anomaly-Detection-on-Guha-Mishra/ecb365ef9b67cd5540cc4c53035a6a7bd88678f9).
+Anomaly detection automatically detects anomalies in your OpenSearch data in near real time using the Random Cut Forest (RCF) algorithm. RCF is an unsupervised machine learning algorithm that models a sketch of your incoming data stream to compute an _anomaly grade_ and _confidence score_ value for each incoming data point. These values are used to differentiate an anomaly from normal variations. For more information about how RCF works, see [Robust Random Cut Forest Based Anomaly Detection on Streams](https://www.semanticscholar.org/paper/Robust-Random-Cut-Forest-Based-Anomaly-Detection-on-Guha-Mishra/ecb365ef9b67cd5540cc4c53035a6a7bd88678f9).
You can pair the Anomaly Detection plugin with the [Alerting plugin]({{site.url}}{{site.baseurl}}/monitoring-plugins/alerting/) to notify you as soon as an anomaly is detected.
+{: .note}
-To get started, choose **Anomaly Detection** in OpenSearch Dashboards.
-To first test with sample streaming data, you can try out one of the preconfigured detectors with one of the sample datasets.
+## Getting started with anomaly detection in OpenSearch Dashboards
+
+To get started, go to **OpenSearch Dashboards** > **OpenSearch Plugins** > **Anomaly Detection**.
## Step 1: Define a detector
-A detector is an individual anomaly detection task. You can define multiple detectors, and all the detectors can run simultaneously, with each analyzing data from different sources.
+A _detector_ is an individual anomaly detection task. You can define multiple detectors, and all detectors can run simultaneously, with each analyzing data from different sources. You can define a detector by following these steps:
+
+1. On the **Anomaly detection** page, select the **Create detector** button.
+2. On the **Define detector** page, enter the required information in the **Detector details** pane.
+3. In the **Select data** pane, specify the data source by choosing a source from the **Index** dropdown menu. You can choose an index, index patterns, or an alias.
+4. (Optional) Filter the data source by selecting **Add data filter** and then entering the conditions for **Field**, **Operator**, and **Value**. Alternatively, you can choose **Use query DSL** and add your JSON filter query. Only [Boolean queries]({{site.url}}{{site.baseurl}}/query-dsl/compound/bool/) are supported for query domain-specific language (DSL).
+#### Example: Filtering data using query DSL
+
+The following example query retrieves documents in which the `urlPath.keyword` field matches any of the specified values:
+
1. Choose **Create detector**.
1. Add in the detector details.
- Enter a name and brief description. Make sure the name is unique and descriptive enough to help you to identify the purpose of the detector.
1. Specify the data source.
- - For **Data source**, choose the index you want to use as the data source. You can optionally use index patterns to choose multiple indexes.
+ - For **Data source**, choose one or more indexes to use as the data source. Alternatively, you can use an alias or index pattern to choose multiple indexes.
+ - Detectors can use remote indexes. You can access them using the `cluster-name:index-name` pattern. See [Cross-cluster search]({{site.url}}{{site.baseurl}}/search-plugins/cross-cluster-search/) for more information. Alternatively, you can select clusters and indexes in OpenSearch Dashboards 2.17 or later. To learn about configuring remote indexes with the Security plugin enabled, see [Selecting remote indexes with fine-grained access control]({{site.url}}{{site.baseurl}}/observing-your-data/ad/security/#selecting-remote-indexes-with-fine-grained-access-control) in the [Anomaly detection security]({{site.url}}{{site.baseurl}}/observing-your-data/ad/security/) documentation.
- (Optional) For **Data filter**, filter the index you chose as the data source. From the **Data filter** menu, choose **Add data filter**, and then design your filter query by selecting **Field**, **Operator**, and **Value**, or choose **Use query DSL** and add your own JSON filter query. Only [Boolean queries]({{site.url}}{{site.baseurl}}/query-dsl/compound/bool/) are supported for query domain-specific language (DSL).
-#### Example filter using query DSL
-The query is designed to retrieve documents in which the `urlPath.keyword` field matches one of the following specified values:
+To create a cross-cluster detector in OpenSearch Dashboards, the following [permissions]({{site.url}}{{site.baseurl}}/security/access-control/permissions/) are required: `indices:data/read/field_caps`, `indices:admin/resolve/index`, and `cluster:monitor/remote/info`.
+{: .note}
- /domain/{id}/short
- /sub_dir/{id}/short
@@ -62,40 +74,38 @@ The query is designed to retrieve documents in which the `urlPath.keyword` field
}
}
```
+ {% include copy-curl.html %}
-1. Specify a timestamp.
- - Select the **Timestamp field** in your index.
-1. Define operation settings.
- - For **Operation settings**, define the **Detector interval**, which is the time interval at which the detector collects data.
- - The detector aggregates the data in this interval, then feeds the aggregated result into the anomaly detection model.
- The shorter you set this interval, the fewer data points the detector aggregates.
- The anomaly detection model uses a shingling process, a technique that uses consecutive data points to create a sample for the model. This process needs a certain number of aggregated data points from contiguous intervals.
-
- - We recommend setting the detector interval based on your actual data. If it's too long it might delay the results, and if it's too short it might miss some data. It also won't have a sufficient number of consecutive data points for the shingle process.
+5. In the **Timestamp** pane, select a field from the **Timestamp field** dropdown menu.
- - (Optional) To add extra processing time for data collection, specify a **Window delay** value.
+6. In the **Operation settings** pane, define the **Detector interval**, which is the interval at which the detector collects data.
+ - The detector aggregates the data at this interval and then feeds the aggregated result into the anomaly detection model. The shorter the interval, the fewer data points the detector aggregates. The anomaly detection model uses a shingling process, a technique that uses consecutive data points to create a sample for the model. This process requires a certain number of aggregated data points from contiguous intervals.
+ - You should set the detector interval based on your actual data. If the detector interval is too long, then it might delay the results. If the detector interval is too short, then it might miss some data. The detector also will not have a sufficient number of consecutive data points for the shingle process.
+ - (Optional) To add extra processing time for data collection, specify a **Window delay** value.
- This value tells the detector that the data is not ingested into OpenSearch in real time but with a certain delay. Set the window delay to shift the detector interval to account for this delay.
- - For example, say the detector interval is 10 minutes and data is ingested into your cluster with a general delay of 1 minute. Assume the detector runs at 2:00. The detector attempts to get the last 10 minutes of data from 1:50 to 2:00, but because of the 1-minute delay, it only gets 9 minutes of data and misses the data from 1:59 to 2:00. Setting the window delay to 1 minute shifts the interval window to 1:49--1:59, so the detector accounts for all 10 minutes of the detector interval time.
-1. Specify custom results index.
- - The Anomaly Detection plugin allows you to store anomaly detection results in a custom index of your choice. To enable this, select **Enable custom results index** and provide a name for your index, for example, `abc`. The plugin then creates an alias prefixed with `opensearch-ad-plugin-result-` followed by your chosen name, for example, `opensearch-ad-plugin-result-abc`. This alias points to an actual index with a name containing the date and a sequence number, like `opensearch-ad-plugin-result-abc-history-2024.06.12-000002`, where your results are stored.
+ - For example, the detector interval is 10 minutes and data is ingested into your cluster with a general delay of 1 minute. Assume the detector runs at 2:00. The detector attempts to get the last 10 minutes of data from 1:50 to 2:00, but because of the 1-minute delay, it only gets 9 minutes of data and misses the data from 1:59 to 2:00. Setting the window delay to 1 minute shifts the interval window to 1:49--1:59, so the detector accounts for all 10 minutes of the detector interval time.
+ - To avoid missing any data, set the **Window delay** to the upper limit of the expected ingestion delay. This ensures that the detector captures all data during its interval, reducing the risk of missing relevant information. While a longer window delay helps capture all data, too long of a window delay can hinder real-time anomaly detection because the detector will look further back in time. Find a balance to maintain both data accuracy and timely detection.
- You can use the dash “-” sign to separate the namespace to manage custom results index permissions. For example, if you use `opensearch-ad-plugin-result-financial-us-group1` as the results index, you can create a permission role based on the pattern `opensearch-ad-plugin-result-financial-us-*` to represent the "financial" department at a granular level for the "us" area.
+7. Specify a custom results index.
+ - The Anomaly Detection plugin allows you to store anomaly detection results in a custom index of your choice. Select **Enable custom results index** and provide a name for your index, for example, `abc`. The plugin then creates an alias prefixed with `opensearch-ad-plugin-result-` followed by your chosen name, for example, `opensearch-ad-plugin-result-abc`. This alias points to an actual index with a name containing the date and a sequence number, such as `opensearch-ad-plugin-result-abc-history-2024.06.12-000002`, where your results are stored.
+
+ You can use `-` to separate the namespace to manage custom results index permissions. For example, if you use `opensearch-ad-plugin-result-financial-us-group1` as the results index, you can create a permission role based on the pattern `opensearch-ad-plugin-result-financial-us-*` to represent the `financial` department at a granular level for the `us` group.
{: .note }
- When the Security plugin (fine-grained access control) is enabled, the default results index becomes a system index and is no longer accessible through the standard Index or Search APIs. To access its content, you must use the Anomaly Detection RESTful API or the dashboard. As a result, you cannot build customized dashboards using the default results index if the Security plugin is enabled. However, you can create a custom results index in order to build customized dashboards.
- If the custom index you specify does not exist, the Anomaly Detection plugin will create it when you create the detector and start your real-time or historical analysis.
- If the custom index already exists, the plugin will verify that the index mapping matches the required structure for anomaly results. In this case, ensure that the custom index has a valid mapping as defined in the [`anomaly-results.json`](https://github.com/opensearch-project/anomaly-detection/blob/main/src/main/resources/mappings/anomaly-results.json) file.
- - To use the custom results index option, you need the following permissions:
- - `indices:admin/create` - The Anomaly Detection plugin requires the ability to create and roll over the custom index.
- - `indices:admin/aliases` - The Anomaly Detection plugin requires access to create and manage an alias for the custom index.
- - `indices:data/write/index` - You need the `write` permission for the Anomaly Detection plugin to write results into the custom index for a single-entity detector.
- - `indices:data/read/search` - You need the `search` permission because the Anomaly Detection plugin needs to search custom results indexes to show results on the Anomaly Detection UI.
- - `indices:data/write/delete` - Because the detector might generate a large number of anomaly results, you need the `delete` permission to delete old data and save disk space.
- - `indices:data/write/bulk*` - You need the `bulk*` permission because the Anomaly Detection plugin uses the bulk API to write results into the custom index.
- - Managing the custom results index:
- - The anomaly detection dashboard queries all detectors’ results from all custom results indexes. Having too many custom results indexes might impact the performance of the Anomaly Detection plugin.
- - You can use [Index State Management]({{site.url}}{{site.baseurl}}/im-plugin/ism/index/) to rollover old results indexes. You can also manually delete or archive any old results indexes. We recommend reusing a custom results index for multiple detectors.
- - The Anomaly Detection plugin also provides lifecycle management for custom indexes. It rolls an alias over to a new index when the custom results index meets any of the conditions in the following table.
+ - To use the custom results index option, you must have the following permissions:
+ - `indices:admin/create` -- The `create` permission is required in order to create and roll over the custom index.
+ - `indices:admin/aliases` -- The `aliases` permission is required in order to create and manage an alias for the custom index.
+ - `indices:data/write/index` -- The `write` permission is required in order to write results into the custom index for a single-entity detector.
+ - `indices:data/read/search` -- The `search` permission is required in order to search custom results indexes to show results on the Anomaly Detection interface.
+ - `indices:data/write/delete` -- The detector may generate many anomaly results. The `delete` permission is required in order to delete old data and save disk space.
+ - `indices:data/write/bulk*` -- The `bulk*` permission is required because the plugin uses the Bulk API to write results into the custom index.
+ - When managing the custom results index, consider the following:
+ - The anomaly detection dashboard queries all detector results from all custom results indexes. Having too many custom results indexes can impact the plugin's performance.
+ - You can use [Index State Management]({{site.url}}{{site.baseurl}}/im-plugin/ism/index/) to roll over old results indexes. You can also manually delete or archive any old results indexes. Reusing a custom results index for multiple detectors is recommended.
+ - The plugin provides lifecycle management for custom indexes. It rolls over an alias to a new index when the custom results index meets any of the conditions in the following table.
Parameter | Description | Type | Unit | Example | Required
:--- | :--- |:--- |:--- |:--- |:---
@@ -103,43 +113,52 @@ The query is designed to retrieve documents in which the `urlPath.keyword` field
`result_index_min_age` | The minimum index age required for rollover, calculated from its creation time to the current time. | `integer` |`day` | `7` | No
`result_index_ttl` | The minimum age required to permanently delete rolled-over indexes. | `integer` | `day` | `60` | No
-1. Choose **Next**.
+8. Choose **Next**.
After you define the detector, the next step is to configure the model.
## Step 2: Configure the model
-#### Add features to your detector
+1. Add features to your detector.
-A feature is the field in your index that you want to check for anomalies. A detector can discover anomalies across one or more features. You must choose an aggregation method for each feature: `average()`, `count()`, `sum()`, `min()`, or `max()`. The aggregation method determines what constitutes an anomaly.
+A _feature_ is any field in your index that you want to analyze for anomalies. A detector can discover anomalies across one or more features. You must choose an aggregation method for each feature: `average()`, `count()`, `sum()`, `min()`, or `max()`. The aggregation method determines what constitutes an anomaly.
For example, if you choose `min()`, the detector focuses on finding anomalies based on the minimum values of your feature. If you choose `average()`, the detector finds anomalies based on the average values of your feature.
-A multi-feature model correlates anomalies across all its features. The [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality) makes it less likely for multi-feature models to identify smaller anomalies as compared to a single-feature model. Adding more features might negatively impact the [precision and recall](https://en.wikipedia.org/wiki/Precision_and_recall) of a model. A higher proportion of noise in your data might further amplify this negative impact. Selecting the optimal feature set is usually an iterative process. By default, the maximum number of features for a detector is 5. You can adjust this limit with the `plugins.anomaly_detection.max_anomaly_features` setting.
-{: .note }
+A multi-feature model correlates anomalies across all its features. The [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality) makes it less likely that multi-feature models will identify smaller anomalies as compared to a single-feature model. Adding more features can negatively impact the [precision and recall](https://en.wikipedia.org/wiki/Precision_and_recall) of a model. A higher proportion of noise in your data can further amplify this negative impact. Selecting the optimal feature set is usually an iterative process. By default, the maximum number of features for a detector is `5`. You can adjust this limit using the `plugins.anomaly_detection.max_anomaly_features` setting.
+{: .note}
+
+### Configuring a model based on an aggregation method
To configure an anomaly detection model based on an aggregation method, follow these steps:
-1. On the **Configure Model** page, enter the **Feature name** and check **Enable feature**.
-1. For **Find anomalies based on**, select **Field Value**.
-1. For **aggregation method**, select either **average()**, **count()**, **sum()**, **min()**, or **max()**.
-1. For **Field**, select from the available options.
+1. On the **Detectors** page, select the desired detector from the list.
+2. On the detector's details page, select the **Actions** button to activate the dropdown menu and then select **Edit model configuration**.
+3. On the **Edit model configuration** page, select the **Add another feature** button.
+4. Enter a name in the **Feature name** field and select the **Enable feature** checkbox.
+5. Select **Field value** from the dropdown menu under **Find anomalies based on**.
+6. Select the desired aggregation from the dropdown menu under **Aggregation method**.
+7. Select the desired field from the options listed in the dropdown menu under **Field**.
+8. Select the **Save changes** button.
+
+### Configuring a model based on a JSON aggregation query
To configure an anomaly detection model based on a JSON aggregation query, follow these steps:
-1. On the **Configure Model** page, enter the **Feature name** and check **Enable feature**.
-1. For **Find anomalies based on**, select **Custom expression**. You will see the JSON editor window open up.
-1. Enter your JSON aggregation query in the editor.
-For acceptable JSON query syntax, see [OpenSearch Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index/)
-{: .note }
+1. On the **Edit model configuration** page, select the **Add another feature** button.
+2. Enter a name in the **Feature name** field and select the **Enable feature** checkbox.
+3. Select **Custom expression** from the dropdown menu under **Find anomalies based on**. The JSON editor window will open.
+4. Enter your JSON aggregation query in the editor.
+5. Select the **Save changes** button.
-#### (Optional) Set category fields for high cardinality
+For acceptable JSON query syntax, see [OpenSearch Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index/).
+{: .note}
-You can categorize anomalies based on a keyword or IP field type.
+### Setting categorical fields for high cardinality
-The category field categorizes or slices the source time series with a dimension like IP addresses, product IDs, country codes, and so on. This helps to see a granular view of anomalies within each entity of the category field to isolate and debug issues.
+You can categorize anomalies based on a keyword or IP field type. You can enable the **Categorical fields** option to categorize, or "slice," the source time series using a dimension, such as an IP address, a product ID, or a country code. This gives you a granular view of anomalies within each entity of the category field to help isolate and debug issues.
-To set a category field, choose **Enable a category field** and select a field. You can’t change the category fields after you create the detector.
+To set a category field, choose **Enable categorical fields** and select a field. You cannot change the category fields after you create the detector.
Only a certain number of unique entities are supported in the category field. Use the following equation to calculate the recommended total number of entities supported in a cluster:
@@ -147,7 +166,7 @@ Only a certain number of unique entities are supported in the category field. Us
(data nodes * heap size * anomaly detection maximum memory percentage) / (entity model size of a detector)
```
-To get the entity model size of a detector, use the [profile detector API]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/api/#profile-detector). You can adjust the maximum memory percentage with the `plugins.anomaly_detection.model_max_size_percent` setting.
+To get the detector's entity model size, use the [Profile Detector API]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/api/#profile-detector). You can adjust the maximum memory percentage using the `plugins.anomaly_detection.model_max_size_percent` setting.
Consider a cluster with 3 data nodes, each with 8 GB of JVM heap size and the default 10% memory allocation. With an entity model size of 1 MB, the following formula calculates the estimated number of unique entities:
@@ -155,81 +174,109 @@ Consider a cluster with 3 data nodes, each with 8 GB of JVM heap size and the de
(8096 MB * 0.1 / 1 MB ) * 3 = 2429
```
-If the actual total number of unique entities is higher than the number that you calculate (in this case, 2,429), the anomaly detector will attempt to model the extra entities. The detector prioritizes entities that occur more often and are more recent.
+If the actual total number of unique entities is higher than the number that you calculate (in this case, 2,429), then the anomaly detector attempts to model the extra entities. The detector prioritizes both entities that occur more often and are more recent.
-This formula serves as a starting point. Make sure to test it with a representative workload. You can find more information in the [Improving Anomaly Detection: One million entities in one minute](https://opensearch.org/blog/one-million-enitities-in-one-minute/) blog post.
+This formula serves as a starting point. Make sure to test it with a representative workload. See the OpenSearch blog post [Improving Anomaly Detection: One million entities in one minute](https://opensearch.org/blog/one-million-enitities-in-one-minute/) for more information.
{: .note }
-#### (Advanced settings) Set a shingle size
+### Setting a shingle size
-Set the number of aggregation intervals from your data stream to consider in a detection window. It’s best to choose this value based on your actual data to see which one leads to the best results for your use case.
+In the **Advanced settings** pane, you can set the number of data stream aggregation intervals to include in the detection window. Choose this value based on your actual data to find the optimal setting for your use case. To set the shingle size, select **Show** in the **Advanced settings** pane. Enter the desired size in the **intervals** field.
-The anomaly detector expects the shingle size to be in the range of 1 and 60. The default shingle size is 8. We recommend that you don't choose 1 unless you have two or more features. Smaller values might increase [recall](https://en.wikipedia.org/wiki/Precision_and_recall) but also false positives. Larger values might be useful for ignoring noise in a signal.
+The anomaly detector requires the shingle size to be between 1 and 128. The default is `8`. Use `1` only if you have at least two features. Values less than `8` may increase [recall](https://en.wikipedia.org/wiki/Precision_and_recall) but may also increase false positives. Values greater than `8` may be useful for ignoring noise in a signal.
-#### Preview sample anomalies
+### Setting an imputation option
-Preview sample anomalies and adjust the feature settings if needed.
-For sample previews, the Anomaly Detection plugin selects a small number of data samples---for example, one data point every 30 minutes---and uses interpolation to estimate the remaining data points to approximate the actual feature data. It loads this sample dataset into the detector. The detector uses this sample dataset to generate a sample preview of anomaly results.
+In the **Advanced settings** pane, you can set the imputation option. This allows you to manage missing data in your streams. The options include the following:
-Examine the sample preview and use it to fine-tune your feature configurations (for example, enable or disable features) to get more accurate results.
+- **Ignore Missing Data (Default):** The system continues without considering missing data points, keeping the existing data flow.
+- **Fill with Custom Values:** Specify a custom value for each feature to replace missing data points, allowing for targeted imputation tailored to your data.
+- **Fill with Zeros:** Replace missing values with zeros. This is ideal when the absence of data indicates a significant event, such as a drop to zero in event counts.
+- **Use Previous Values:** Fill gaps with the last observed value to maintain continuity in your time-series data. This method treats missing data as non-anomalous, carrying forward the previous trend.
-1. Choose **Preview sample anomalies**.
- - If you don't see any sample anomaly result, check the detector interval and make sure you have more than 400 data points for some entities during the preview date range.
-1. Choose **Next**.
+Using these options can improve recall in anomaly detection. For instance, if you are monitoring for drops in event counts, including both partial and complete drops, then filling missing values with zeros helps detect significant data absences, improving detection recall.
+
+Be cautious when imputing extensively missing data, as excessive gaps can compromise model accuracy. Quality input is critical---poor data quality leads to poor model performance. The confidence score also decreases when imputations occur. You can check whether a feature value has been imputed using the `feature_imputed` field in the anomaly results index. See [Anomaly result mapping]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/result-mapping/) for more information.
+{: .note}
+
+### Suppressing anomalies with threshold-based rules
+
+In the **Advanced settings** pane, you can suppress anomalies by setting rules that define acceptable differences between the expected and actual values, either as an absolute value or a relative percentage. This helps reduce false anomalies caused by minor fluctuations, allowing you to focus on significant deviations.
+
+Suppose you want to detect substantial changes in log volume while ignoring small variations that are not meaningful. Without customized settings, the system might generate false alerts for minor changes, making it difficult to identify true anomalies. By setting suppression rules, you can ignore minor deviations and focus on real anomalous patterns.
+
+To suppress anomalies for deviations of less than 30% from the expected value, you can set the following rules:
-## Step 3: Set up detector jobs
+```
+Ignore anomalies for feature logVolume when the actual value is no more than 30% above the expected value.
+Ignore anomalies for feature logVolume when the actual value is no more than 30% below the expected value.
+```
+
+Ensure that a feature, for example, `logVolume`, is properly defined in your model. Suppression rules are tied to specific features.
+{: .note}
+
+If you expect that the log volume should differ by at least 10,000 from the expected value before being considered an anomaly, you can set absolute thresholds:
-To start a real-time detector to find anomalies in your data in near real-time, check **Start real-time detector automatically (recommended)**.
+```
+Ignore anomalies for feature logVolume when the actual value is no more than 10000 above the expected value.
+Ignore anomalies for feature logVolume when the actual value is no more than 10000 below the expected value.
+```
-Alternatively, if you want to perform historical analysis and find patterns in long historical data windows (weeks or months), check **Run historical analysis detection** and select a date range (at least 128 detection intervals).
+If no custom suppression rules are set, then the system defaults to a filter that ignores anomalies with deviations of less than 20% from the expected value for each enabled feature.
-Analyzing historical data helps you get familiar with the Anomaly Detection plugin. You can also evaluate the performance of a detector with historical data to further fine-tune it.
+### Previewing sample anomalies
-We recommend experimenting with historical analysis with different feature sets and checking the precision before moving on to real-time detectors.
+You can preview anomalies based on sample feature input and adjust the feature settings as needed. The Anomaly Detection plugin selects a small number of data samples---for example, 1 data point every 30 minutes---and uses interpolation to estimate the remaining data points to approximate the actual feature data. The sample dataset is loaded into the detector, which then uses the sample dataset to generate a preview of the anomalies.
+
+1. Choose **Preview sample anomalies**.
+ - If sample anomaly results are not displayed, check the detector interval and verify that there are 400 or more data points for the entities during the preview date range.
+2. Select the **Next** button.
-## Step 4: Review and create
+## Step 3: Setting up detector jobs
-Review your detector settings and model configurations to make sure that they're valid and then select **Create detector**.
+To start a detector to find anomalies in your data in near real time, select **Start real-time detector automatically (recommended)**.
-![Anomaly detection results]({{site.url}}{{site.baseurl}}/images/review_ad.png)
+Alternatively, if you want to perform historical analysis and find patterns in longer historical data windows (weeks or months), select the **Run historical analysis detection** box and select a date range of at least 128 detection intervals.
-If you see any validation errors, edit the settings to fix the errors and then return back to this page.
+Analyzing historical data can help to familiarize you with the Anomaly Detection plugin. For example, you can evaluate the performance of a detector against historical data in order to fine-tune it.
+
+You can experiment with historical analysis by using different feature sets and checking the precision before using real-time detectors.
+
+## Step 4: Reviewing detector settings
+
+Review your detector settings and model configurations to confirm that they are valid and then select **Create detector**.
+
+If a validation error occurs, edit the settings to correct the error and return to the detector page.
{: .note }
-## Step 5: Observe the results
+## Step 5: Observing the results
-Choose the **Real-time results** or **Historical analysis** tab. For real-time results, you need to wait for some time to see the anomaly results. If the detector interval is 10 minutes, the detector might take more than an hour to start, because its waiting for sufficient data to generate anomalies.
+Choose either the **Real-time results** or **Historical analysis** tab. For real-time results, it will take some time to display the anomaly results. For example, if the detector interval is 10 minutes, then the detector may take an hour to initiate because it is waiting for sufficient data to be able to generate anomalies.
-A shorter interval means the model passes the shingle process more quickly and starts to generate the anomaly results sooner.
-Use the [profile detector]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/api#profile-detector) operation to make sure you have sufficient data points.
+A shorter interval results in the model passing the shingle process more quickly and generating anomaly results sooner. You can use the [profile detector]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/api#profile-detector) operation to ensure that you have enough data points.
-If you see the detector pending in "initialization" for longer than a day, aggregate your existing data using the detector interval to check for any missing data points. If you find a lot of missing data points from the aggregated data, consider increasing the detector interval.
+If the detector is pending in "initialization" for longer than 1 day, aggregate your existing data using the detector interval and check for any missing data points. If you find many missing data points, consider increasing the detector interval.
-Choose and drag over the anomaly line chart to zoom in and see a more detailed view of an anomaly.
+Click and drag over the anomaly line chart to zoom in and see a detailed view of an anomaly.
{: .note }
-Analyze anomalies with the following visualizations:
+You can analyze anomalies using the following visualizations:
-- **Live anomalies** (for real-time results) displays live anomaly results for the last 60 intervals. For example, if the interval is 10, it shows results for the last 600 minutes. The chart refreshes every 30 seconds.
-- **Anomaly overview** (for real-time results) / **Anomaly history** (for historical analysis in the **Historical analysis** tab) plots the anomaly grade with the corresponding measure of confidence. This pane includes:
+- **Live anomalies** (for real-time results) displays live anomaly results for the last 60 intervals. For example, if the interval is `10`, it shows results for the last 600 minutes. The chart refreshes every 30 seconds.
+- **Anomaly overview** (for real-time results) or **Anomaly history** (for historical analysis on the **Historical analysis** tab) plots the anomaly grade with the corresponding measure of confidence. The pane includes:
- The number of anomaly occurrences based on the given data-time range.
- - The **Average anomaly grade**, a number between 0 and 1 that indicates how anomalous a data point is. An anomaly grade of 0 represents “not an anomaly,” and a non-zero value represents the relative severity of the anomaly.
+ - The **Average anomaly grade**, a number between 0 and 1 that indicates how anomalous a data point is. An anomaly grade of `0` represents "not an anomaly," and a non-zero value represents the relative severity of the anomaly.
- **Confidence** estimate of the probability that the reported anomaly grade matches the expected anomaly grade. Confidence increases as the model observes more data and learns the data behavior and trends. Note that confidence is distinct from model accuracy.
- **Last anomaly occurrence** is the time at which the last anomaly occurred.
-Underneath **Anomaly overview**/**Anomaly history** are:
+Underneath **Anomaly overview** or **Anomaly history** are:
- **Feature breakdown** plots the features based on the aggregation method. You can vary the date-time range of the detector. Selecting a point on the feature line chart shows the **Feature output**, the number of times a field appears in your index, and the **Expected value**, a predicted value for the feature output. Where there is no anomaly, the output and expected values are equal.
- ![Anomaly detection results]({{site.url}}{{site.baseurl}}/images/feature-contribution-ad.png)
-
- **Anomaly occurrences** shows the `Start time`, `End time`, `Data confidence`, and `Anomaly grade` for each detected anomaly.
Selecting a point on the anomaly line chart shows **Feature Contribution**, the percentage of a feature that contributes to the anomaly
-![Anomaly detection results]({{site.url}}{{site.baseurl}}/images/feature-contribution-ad.png)
-
-
If you set the category field, you see an additional **Heat map** chart. The heat map correlates results for anomalous entities. This chart is empty until you select an anomalous entity. You also see the anomaly and feature line chart for the time period of the anomaly (`anomaly_grade` > 0).
@@ -249,7 +296,7 @@ To see all the configuration settings for a detector, choose the **Detector conf
1. To make any changes to the detector configuration, or fine tune the time interval to minimize any false positives, go to the **Detector configuration** section and choose **Edit**.
- You need to stop real-time and historical analysis to change its configuration. Confirm that you want to stop the detector and proceed.
-1. To enable or disable features, in the **Features** section, choose **Edit** and adjust the feature settings as needed. After you make your changes, choose **Save and start detector**.
+2. To enable or disable features, in the **Features** section, choose **Edit** and adjust the feature settings as needed. After you make your changes, choose **Save and start detector**.
## Step 8: Manage your detectors
diff --git a/_observing-your-data/ad/result-mapping.md b/_observing-your-data/ad/result-mapping.md
index 7e1482a013..967b185684 100644
--- a/_observing-your-data/ad/result-mapping.md
+++ b/_observing-your-data/ad/result-mapping.md
@@ -9,9 +9,7 @@ redirect_from:
# Anomaly result mapping
-If you enabled custom result index, the anomaly detection plugin stores the results in your own index.
-
-If the anomaly detector doesn’t detect an anomaly, the result has the following format:
+When you select the **Enable custom result index** box on the **Custom result index** pane, the Anomaly Detection plugin will save the results to an index of your choosing. When the anomaly detector does not detect an anomaly, the result format is as follows:
```json
{
@@ -61,6 +59,7 @@ If the anomaly detector doesn’t detect an anomaly, the result has the followin
"threshold": 1.2368549346675202
}
```
+{% include copy-curl.html %}
## Response body fields
@@ -80,7 +79,83 @@ Field | Description
`model_id` | A unique ID that identifies a model. If a detector is a single-stream detector (with no category field), it has only one model. If a detector is a high-cardinality detector (with one or more category fields), it might have multiple models, one for each entity.
`threshold` | One of the criteria for a detector to classify a data point as an anomaly is that its `anomaly_score` must surpass a dynamic threshold. This field records the current threshold.
-If an anomaly detector detects an anomaly, the result has the following format:
+When the imputation option is enabled, the anomaly results include a `feature_imputed` array showing which features were modified due to missing data. If no features were imputed, then this array is excluded.
+
+In the following example anomaly result output, the `processing_bytes_max` feature was imputed, as shown by the `imputed: true` status:
+
+```json
+{
+ "detector_id": "kzcZ43wBgEQAbjDnhzGF",
+ "schema_version": 5,
+ "data_start_time": 1635898161367,
+ "data_end_time": 1635898221367,
+ "feature_data": [
+ {
+ "feature_id": "processing_bytes_max",
+ "feature_name": "processing bytes max",
+ "data": 2322
+ },
+ {
+ "feature_id": "processing_bytes_avg",
+ "feature_name": "processing bytes avg",
+ "data": 1718.6666666666667
+ },
+ {
+ "feature_id": "processing_bytes_min",
+ "feature_name": "processing bytes min",
+ "data": 1375
+ },
+ {
+ "feature_id": "processing_bytes_sum",
+ "feature_name": "processing bytes sum",
+ "data": 5156
+ },
+ {
+ "feature_id": "processing_time_max",
+ "feature_name": "processing time max",
+ "data": 31198
+ }
+ ],
+ "execution_start_time": 1635898231577,
+ "execution_end_time": 1635898231622,
+ "anomaly_score": 1.8124904404395776,
+ "anomaly_grade": 0,
+ "confidence": 0.9802940756605277,
+ "entity": [
+ {
+ "name": "process_name",
+ "value": "process_3"
+ }
+ ],
+ "model_id": "kzcZ43wBgEQAbjDnhzGF_entity_process_3",
+ "threshold": 1.2368549346675202,
+ "feature_imputed": [
+ {
+ "feature_id": "processing_bytes_max",
+ "imputed": true
+ },
+ {
+ "feature_id": "processing_bytes_avg",
+ "imputed": false
+ },
+ {
+ "feature_id": "processing_bytes_min",
+ "imputed": false
+ },
+ {
+ "feature_id": "processing_bytes_sum",
+ "imputed": false
+ },
+ {
+ "feature_id": "processing_time_max",
+ "imputed": false
+ }
+ ]
+}
+```
+{% include copy-curl.html %}
+
+When an anomaly is detected, the result is provided in the following format:
```json
{
@@ -179,24 +254,23 @@ If an anomaly detector detects an anomaly, the result has the following format:
"execution_start_time": 1635898427803
}
```
+{% include copy-curl.html %}
-You can see the following additional fields:
+Note that the result includes the following additional fields.
Field | Description
:--- | :---
`relevant_attribution` | Represents the contribution of each input variable. The sum of the attributions is normalized to 1.
`expected_values` | The expected value for each feature.
-At times, the detector might detect an anomaly late.
-Let's say the detector sees a random mix of the triples {1, 2, 3} and {2, 4, 5} that correspond to `slow weeks` and `busy weeks`, respectively. For example 1, 2, 3, 1, 2, 3, 2, 4, 5, 1, 2, 3, 2, 4, 5, ... and so on.
-If the detector comes across a pattern {2, 2, X} and it's yet to see X, the detector infers that the pattern is anomalous, but it can't determine at this point which of the 2's is the cause. If X = 3, then the detector knows it's the first 2 in that unfinished triple, and if X = 5, then it's the second 2. If it's the first 2, then the detector detects the anomaly late.
+The detector may be late in detecting an anomaly. For example, the detector observes a sequence of data that alternates between "slow weeks" (represented by the triples {1, 2, 3}) and "busy weeks" (represented by the triples {2, 4, 5}). If the detector comes across a pattern {2, 2, X}, where it has not yet seen the value that X will take, then the detector infers that the pattern is anomalous. However, it cannot determine which 2 is the cause. If X = 3, then the first 2 is the anomaly. If X = 5, then the second 2 is the anomaly. If it is the first 2, then the detector will be late in detecting the anomaly.
-If a detector detects an anomaly late, the result has the following additional fields:
+When a detector is late in detecting an anomaly, the result includes the following additional fields.
Field | Description
:--- | :---
-`past_values` | The actual input that triggered an anomaly. If `past_values` is null, the attributions or expected values are from the current input. If `past_values` is not null, the attributions or expected values are from a past input (for example, the previous two steps of the data [1,2,3]).
-`approx_anomaly_start_time` | The approximate time of the actual input that triggers an anomaly. This field helps you understand when a detector flags an anomaly. Both single-stream and high-cardinality detectors don't query previous anomaly results because these queries are expensive operations. The cost is especially high for high-cardinality detectors that might have a lot of entities. If the data is not continuous, the accuracy of this field is low and the actual time that the detector detects an anomaly can be earlier.
+`past_values` | The actual input that triggered an anomaly. If `past_values` is `null`, then the attributions or expected values are from the current input. If `past_values` is not `null`, then the attributions or expected values are from a past input (for example, the previous two steps of the data [1,2,3]).
+`approx_anomaly_start_time` | The approximate time of the actual input that triggered an anomaly. This field helps you understand the time at which a detector flags an anomaly. Both single-stream and high-cardinality detectors do not query previous anomaly results because these queries are costly operations. The cost is especially high for high-cardinality detectors that may have many entities. If the data is not continuous, then the accuracy of this field is low and the actual time at which the detector detects an anomaly can be earlier.
```json
{
@@ -319,3 +393,4 @@ Field | Description
"approx_anomaly_start_time": 1635883620000
}
```
+{% include copy-curl.html %}
diff --git a/_observing-your-data/ad/security.md b/_observing-your-data/ad/security.md
index 8eeaa3df41..e4816cec46 100644
--- a/_observing-your-data/ad/security.md
+++ b/_observing-your-data/ad/security.md
@@ -23,6 +23,11 @@ As an admin user, you can use the Security plugin to assign specific permissions
The Security plugin has two built-in roles that cover most anomaly detection use cases: `anomaly_full_access` and `anomaly_read_access`. For descriptions of each, see [Predefined roles]({{site.url}}{{site.baseurl}}/security/access-control/users-roles#predefined-roles).
+If you use OpenSearch Dashboards to create your anomaly detectors, you may experience access issues even with `anomaly_full_access`. This issue has been resolved in OpenSearch 2.17, but for earlier versions, the following additional permissions need to be added:
+
+- `indices:data/read/search` -- You need this permission because the Anomaly Detection plugin needs to search the data source in order to validate whether there is enough data to train the model.
+- `indices:admin/mappings/fields/get` and `indices:admin/mappings/fields/get*` -- You need these permissions to validate whether the given data source has a valid timestamp field and categorical field (in the case of creating a high-cardinality detector).
+
If these roles don't meet your needs, mix and match individual anomaly detection [permissions]({{site.url}}{{site.baseurl}}/security/access-control/permissions/) to suit your use case. Each action corresponds to an operation in the REST API. For example, the `cluster:admin/opensearch/ad/detector/delete` permission lets you delete detectors.
### A note on alerts and fine-grained access control
@@ -31,6 +36,42 @@ When a trigger generates an alert, the detector and monitor configurations, the
To reduce the chances of unintended users viewing metadata that could describe an index, we recommend that administrators enable role-based access control and keep these kinds of design elements in mind when assigning permissions to the intended group of users. See [Limit access by backend role](#advanced-limit-access-by-backend-role) for details.
+### Selecting remote indexes with fine-grained access control
+
+To use a remote index as a data source for a detector, see the setup steps in [Authentication flow]({{site.url}}{{site.baseurl}}/search-plugins/cross-cluster-search/#authentication-flow) in [Cross-cluster search]({{site.url}}{{site.baseurl}}/search-plugins/cross-cluster-search/). You must use a role that exists in both the remote and local clusters. The remote cluster must map the chosen role to the same username as in the local cluster.
+
+---
+
+#### Example: Create a new user on the local cluster
+
+1. Create a new user on the local cluster to use for detector creation:
+
+```
+curl -XPUT -k -u 'admin:' 'https://localhost:9200/_plugins/_security/api/internalusers/anomalyuser' -H 'Content-Type: application/json' -d '{"password":"password"}'
+```
+{% include copy-curl.html %}
+
+2. Map the new user to the `anomaly_full_access` role:
+
+```
+curl -XPUT -k -u 'admin:' -H 'Content-Type: application/json' 'https://localhost:9200/_plugins/_security/api/rolesmapping/anomaly_full_access' -d '{"users" : ["anomalyuser"]}'
+```
+{% include copy-curl.html %}
+
+3. On the remote cluster, create the same user and map `anomaly_full_access` to that role:
+
+```
+curl -XPUT -k -u 'admin:' 'https://localhost:9250/_plugins/_security/api/internalusers/anomalyuser' -H 'Content-Type: application/json' -d '{"password":"password"}'
+curl -XPUT -k -u 'admin:' -H 'Content-Type: application/json' 'https://localhost:9250/_plugins/_security/api/rolesmapping/anomaly_full_access' -d '{"users" : ["anomalyuser"]}'
+```
+{% include copy-curl.html %}
+
+---
+
+### Custom results index
+
+To use a custom results index, you need additional permissions not included in the default roles provided by the OpenSearch Security plugin. To add these permissions, see [Step 1: Define a detector]({{site.url}}{{site.baseurl}}/observing-your-data/ad/index/#step-1-define-a-detector) in the [Anomaly detection]({{site.url}}{{site.baseurl}}/observing-your-data/ad/index/) documentation.
+
## (Advanced) Limit access by backend role
Use backend roles to configure fine-grained access to individual detectors based on roles. For example, users of different departments in an organization can view detectors owned by their own department.
diff --git a/_observing-your-data/query-insights/grouping-top-n-queries.md b/_observing-your-data/query-insights/grouping-top-n-queries.md
new file mode 100644
index 0000000000..28cbcbb8e5
--- /dev/null
+++ b/_observing-your-data/query-insights/grouping-top-n-queries.md
@@ -0,0 +1,331 @@
+---
+layout: default
+title: Grouping top N queries
+parent: Query insights
+nav_order: 20
+---
+
+# Grouping top N queries
+**Introduced 2.17**
+{: .label .label-purple }
+
+Monitoring the [top N queries]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/top-n-queries/) can help you to identify the most resource-intensive queries based on latency, CPU, and memory usage in a specified time window. However, if a single computationally expensive query is executed multiple times, it can occupy all top N query slots, potentially preventing other expensive queries from appearing in the list. To address this issue, you can group similar queries, gaining insight into various high-impact query groups.
+
+Starting with OpenSearch version 2.17, the top N queries can be grouped by `similarity`, with additional grouping options planned for future version releases.
+
+## Grouping queries by similarity
+
+Grouping queries by `similarity` organizes them based on the query structure, removing everything except the core query operations.
+
+For example, the following query:
+
+```json
+{
+ "query": {
+ "bool": {
+ "must": [
+ { "exists": { "field": "field1" } }
+ ],
+ "query_string": {
+ "query": "search query"
+ }
+ }
+ }
+}
+```
+
+Has the following corresponding query structure:
+
+```c
+bool
+ must
+ exists
+ query_string
+```
+
+When queries share the same query structure, they are grouped together, ensuring that all similar queries belong to the same group.
+
+
+## Aggregate metrics per group
+
+In addition to retrieving latency, CPU, and memory metrics for individual top N queries, you can obtain aggregate statistics for the
+top N query groups. For each query group, the response includes the following statistics:
+- The total latency, CPU usage, or memory usage (depending on the configured metric type)
+- The total query count
+
+Using these statistics, you can calculate the average latency, CPU usage, or memory usage for each query group.
+The response also includes one example query from the query group.
+
+## Configuring query grouping
+
+Before you enable query grouping, you must enable top N query monitoring for a metric type of your choice. For more information, see [Configuring top N query monitoring]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/top-n-queries/#configuring-top-n-query-monitoring).
+
+To configure grouping for top N queries, use the following steps.
+
+### Step 1: Enable top N query monitoring
+
+Ensure that top N query monitoring is enabled for at least one of the metrics: latency, CPU, or memory. For more information, see [Configuring top N query monitoring]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/top-n-queries/#configuring-top-n-query-monitoring).
+
+For example, to enable top N query monitoring by latency with the default settings, send the following request:
+
+```json
+PUT _cluster/settings
+{
+ "persistent" : {
+ "search.insights.top_queries.latency.enabled" : true
+ }
+}
+```
+{% include copy-curl.html %}
+
+### Step 2: Configure query grouping
+
+Set the desired grouping method by updating the following cluster setting:
+
+```json
+PUT _cluster/settings
+{
+ "persistent" : {
+ "search.insights.top_queries.group_by" : "similarity"
+ }
+}
+```
+{% include copy-curl.html %}
+
+The default value for the `group_by` setting is `none`, which disables grouping. As of OpenSearch 2.17, the supported values for `group_by` are `similarity` and `none`.
+
+### Step 3 (Optional): Limit the number of monitored query groups
+
+Optionally, you can limit the number of monitored query groups. Queries already included in the top N query list (the most resource-intensive queries) will not be considered in determining the limit. Essentially, the maximum applies only to other query groups, and the top N queries are tracked separately. This helps manage the tracking of query groups based on workload and query window size.
+
+To limit tracking to 100 query groups, send the following request:
+
+```json
+PUT _cluster/settings
+{
+ "persistent" : {
+ "search.insights.top_queries.max_groups_excluding_topn" : 100
+ }
+}
+```
+{% include copy-curl.html %}
+
+The default value for `max_groups_excluding_topn` is `100`, and you can set it to any value between `0` and `10000`, inclusive.
+
+## Monitoring query groups
+
+To view the top N query groups, send the following request:
+
+```json
+GET /_insights/top_queries
+```
+{% include copy-curl.html %}
+
+The response contains the top N query groups:
+
+
+<details open markdown="block">
+  <summary>
+    Response
+  </summary>
+  {: .text-delta}
+
+```json
+{
+ "top_queries": [
+ {
+ "timestamp": 1725495127359,
+ "source": {
+ "query": {
+ "match_all": {
+ "boost": 1.0
+ }
+ }
+ },
+ "phase_latency_map": {
+ "expand": 0,
+ "query": 55,
+ "fetch": 3
+ },
+ "total_shards": 1,
+ "node_id": "ZbINz1KFS1OPeFmN-n5rdg",
+ "query_hashcode": "b4c4f69290df756021ca6276be5cbb75",
+ "task_resource_usages": [
+ {
+ "action": "indices:data/read/search[phase/query]",
+ "taskId": 30,
+ "parentTaskId": 29,
+ "nodeId": "ZbINz1KFS1OPeFmN-n5rdg",
+ "taskResourceUsage": {
+ "cpu_time_in_nanos": 33249000,
+ "memory_in_bytes": 2896848
+ }
+ },
+ {
+ "action": "indices:data/read/search",
+ "taskId": 29,
+ "parentTaskId": -1,
+ "nodeId": "ZbINz1KFS1OPeFmN-n5rdg",
+ "taskResourceUsage": {
+ "cpu_time_in_nanos": 3151000,
+ "memory_in_bytes": 133936
+ }
+ }
+ ],
+ "indices": [
+ "my_index"
+ ],
+ "labels": {},
+ "search_type": "query_then_fetch",
+ "measurements": {
+ "latency": {
+ "number": 160,
+ "count": 10,
+ "aggregationType": "AVERAGE"
+ }
+ }
+ },
+ {
+ "timestamp": 1725495135160,
+ "source": {
+ "query": {
+ "term": {
+ "content": {
+ "value": "first",
+ "boost": 1.0
+ }
+ }
+ }
+ },
+ "phase_latency_map": {
+ "expand": 0,
+ "query": 18,
+ "fetch": 0
+ },
+ "total_shards": 1,
+ "node_id": "ZbINz1KFS1OPeFmN-n5rdg",
+ "query_hashcode": "c3620cc3d4df30fb3f95aeb2167289a4",
+ "task_resource_usages": [
+ {
+ "action": "indices:data/read/search[phase/query]",
+ "taskId": 50,
+ "parentTaskId": 49,
+ "nodeId": "ZbINz1KFS1OPeFmN-n5rdg",
+ "taskResourceUsage": {
+ "cpu_time_in_nanos": 10188000,
+ "memory_in_bytes": 288136
+ }
+ },
+ {
+ "action": "indices:data/read/search",
+ "taskId": 49,
+ "parentTaskId": -1,
+ "nodeId": "ZbINz1KFS1OPeFmN-n5rdg",
+ "taskResourceUsage": {
+ "cpu_time_in_nanos": 262000,
+ "memory_in_bytes": 3216
+ }
+ }
+ ],
+ "indices": [
+ "my_index"
+ ],
+ "labels": {},
+ "search_type": "query_then_fetch",
+ "measurements": {
+ "latency": {
+ "number": 109,
+ "count": 7,
+ "aggregationType": "AVERAGE"
+ }
+ }
+ },
+ {
+ "timestamp": 1725495139766,
+ "source": {
+ "query": {
+ "match": {
+ "content": {
+ "query": "first",
+ "operator": "OR",
+ "prefix_length": 0,
+ "max_expansions": 50,
+ "fuzzy_transpositions": true,
+ "lenient": false,
+ "zero_terms_query": "NONE",
+ "auto_generate_synonyms_phrase_query": true,
+ "boost": 1.0
+ }
+ }
+ }
+ },
+ "phase_latency_map": {
+ "expand": 0,
+ "query": 15,
+ "fetch": 0
+ },
+ "total_shards": 1,
+ "node_id": "ZbINz1KFS1OPeFmN-n5rdg",
+ "query_hashcode": "484eaabecd13db65216b9e2ff5eee999",
+ "task_resource_usages": [
+ {
+ "action": "indices:data/read/search[phase/query]",
+ "taskId": 64,
+ "parentTaskId": 63,
+ "nodeId": "ZbINz1KFS1OPeFmN-n5rdg",
+ "taskResourceUsage": {
+ "cpu_time_in_nanos": 12161000,
+ "memory_in_bytes": 473456
+ }
+ },
+ {
+ "action": "indices:data/read/search",
+ "taskId": 63,
+ "parentTaskId": -1,
+ "nodeId": "ZbINz1KFS1OPeFmN-n5rdg",
+ "taskResourceUsage": {
+ "cpu_time_in_nanos": 293000,
+ "memory_in_bytes": 3216
+ }
+ }
+ ],
+ "indices": [
+ "my_index"
+ ],
+ "labels": {},
+ "search_type": "query_then_fetch",
+ "measurements": {
+ "latency": {
+ "number": 43,
+ "count": 3,
+ "aggregationType": "AVERAGE"
+ }
+ }
+ }
+ ]
+}
+```
+
+</details>
+
+## Response fields
+
+The response includes the following fields.
+
+Field | Data type | Description
+:--- |:---| :---
+`top_queries` | Array | The list of top query groups.
+`top_queries.timestamp` | Integer | The execution timestamp for the first query in the query group.
+`top_queries.source` | Object | The first query in the query group.
+`top_queries.phase_latency_map` | Object | The phase latency map for the first query in the query group. The map includes the amount of time, in milliseconds, that the query spent in the `expand`, `query`, and `fetch` phases.
+`top_queries.total_shards` | Integer | The number of shards on which the first query was executed.
+`top_queries.node_id` | String | The node ID of the node that coordinated the execution of the first query in the query group.
+`top_queries.query_hashcode` | String | The hash code that uniquely identifies the query group. This is essentially the hash of the [query structure](#grouping-queries-by-similarity).
+`top_queries.task_resource_usages` | Array of objects | The resource usage breakdown for the various tasks belonging to the first query in the query group.
+`top_queries.indices` | Array | The indexes searched by the first query in the query group.
+`top_queries.labels` | Object | Used to label the top query.
+`top_queries.search_type` | String | The search request execution type (`query_then_fetch` or `dfs_query_then_fetch`). For more information, see the `search_type` parameter in the [Search API documentation]({{site.url}}{{site.baseurl}}/api-reference/search/#url-parameters).
+`top_queries.measurements` | Object | The aggregate measurements for the query group.
+`top_queries.measurements.latency` | Object | The aggregate latency measurements for the query group.
+`top_queries.measurements.latency.number` | Integer | The total latency for the query group.
+`top_queries.measurements.latency.count` | Integer | The number of queries in the query group.
+`top_queries.measurements.latency.aggregationType` | String | The aggregation type for the current entry. If grouping by similarity is enabled, then `aggregationType` is `AVERAGE`. If it is not enabled, then `aggregationType` is `NONE`.
\ No newline at end of file
diff --git a/_observing-your-data/query-insights/index.md b/_observing-your-data/query-insights/index.md
index 549371240f..ef3a65bfcd 100644
--- a/_observing-your-data/query-insights/index.md
+++ b/_observing-your-data/query-insights/index.md
@@ -7,8 +7,10 @@ has_toc: false
---
# Query insights
+**Introduced 2.12**
+{: .label .label-purple }
-To monitor and analyze the search queries within your OpenSearch clusterQuery information, you can obtain query insights. With minimal performance impact, query insights features aim to provide comprehensive insights into search query execution, enabling you to better understand search query characteristics, patterns, and system behavior during query execution stages. Query insights facilitate enhanced detection, diagnosis, and prevention of query performance issues, ultimately improving query processing performance, user experience, and overall system resilience.
+To monitor and analyze the search queries within your OpenSearch cluster, you can obtain query insights. With minimal performance impact, query insights features aim to provide comprehensive insights into search query execution, enabling you to better understand search query characteristics, patterns, and system behavior during query execution stages. Query insights facilitate enhanced detection, diagnosis, and prevention of query performance issues, ultimately improving query processing performance, user experience, and overall system resilience.
Typical use cases for query insights features include the following:
@@ -36,4 +38,5 @@ For information about installing plugins, see [Installing plugins]({{site.url}}{
You can obtain the following information using Query Insights:
- [Top n queries]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/top-n-queries/)
+- [Grouping top N queries]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/grouping-top-n-queries/)
- [Query metrics]({{site.url}}{{site.baseurl}}/observing-your-data/query-insights/query-metrics/)
diff --git a/_observing-your-data/query-insights/query-metrics.md b/_observing-your-data/query-insights/query-metrics.md
index c8caf21d65..beac8d4e18 100644
--- a/_observing-your-data/query-insights/query-metrics.md
+++ b/_observing-your-data/query-insights/query-metrics.md
@@ -2,10 +2,12 @@
layout: default
title: Query metrics
parent: Query insights
-nav_order: 20
+nav_order: 30
---
# Query metrics
+**Introduced 2.16**
+{: .label .label-purple }
Key query [metrics](#metrics), such as aggregation types, query types, latency, and resource usage per query type, are captured along the search path by using the OpenTelemetry (OTel) instrumentation framework. The telemetry data can be consumed using OTel metrics [exporters]({{site.url}}{{site.baseurl}}/observing-your-data/trace/distributed-tracing/#exporters).
diff --git a/_observing-your-data/query-insights/top-n-queries.md b/_observing-your-data/query-insights/top-n-queries.md
index f07fd2dfef..b63d670926 100644
--- a/_observing-your-data/query-insights/top-n-queries.md
+++ b/_observing-your-data/query-insights/top-n-queries.md
@@ -7,7 +7,7 @@ nav_order: 10
# Top N queries
-Monitoring the top N queries in query insights features can help you gain real-time insights into the top queries with high latency within a certain time frame (for example, the last hour).
+Monitoring the top N queries using query insights allows you to gain real-time visibility into the queries with the highest latency or resource consumption in a specified time period (for example, the last hour).
## Configuring top N query monitoring
@@ -72,14 +72,14 @@ PUT _cluster/settings
## Monitoring the top N queries
-You can use the Insights API endpoint to obtain the top N queries for all metric types:
+You can use the Insights API endpoint to retrieve the top N queries. This API returns top N `latency` results by default.
```json
GET /_insights/top_queries
```
{% include copy-curl.html %}
-Specify a metric type to filter the response:
+Specify the `type` parameter to retrieve the top N results for other metric types. The results will be sorted in descending order based on the specified metric type.
```json
GET /_insights/top_queries?type=latency
@@ -96,6 +96,9 @@ GET /_insights/top_queries?type=memory
```
{% include copy-curl.html %}
+If your query returns no results, ensure that top N query monitoring is enabled for the target metric type and that search requests were made within the current [time window](#configuring-the-window-size).
+{: .important}
+
## Exporting top N query data
You can configure your desired exporter to export top N query data to different sinks, allowing for better monitoring and analysis of your OpenSearch queries. Currently, the following exporters are supported:
diff --git a/_query-dsl/geo-and-xy/geo-bounding-box.md b/_query-dsl/geo-and-xy/geo-bounding-box.md
index 1112a4278e..66fcc224d6 100644
--- a/_query-dsl/geo-and-xy/geo-bounding-box.md
+++ b/_query-dsl/geo-and-xy/geo-bounding-box.md
@@ -173,11 +173,11 @@ GET testindex1/_search
```
{% include copy-curl.html %}
-## Request fields
+## Parameters
-Geo-bounding box queries accept the following fields.
+Geo-bounding box queries accept the following parameters.
-Field | Data type | Description
+Parameter | Data type | Description
:--- | :--- | :---
`_name` | String | The name of the filter. Optional.
`validation_method` | String | The validation method. Valid values are `IGNORE_MALFORMED` (accept geopoints with invalid coordinates), `COERCE` (try to coerce coordinates to valid values), and `STRICT` (return an error when coordinates are invalid). Default is `STRICT`.
diff --git a/_query-dsl/geo-and-xy/geodistance.md b/_query-dsl/geo-and-xy/geodistance.md
index b272cad81e..3eef58bc69 100644
--- a/_query-dsl/geo-and-xy/geodistance.md
+++ b/_query-dsl/geo-and-xy/geodistance.md
@@ -103,11 +103,11 @@ The response contains the matching document:
}
```
-## Request fields
+## Parameters
-Geodistance queries accept the following fields.
+Geodistance queries accept the following parameters.
-Field | Data type | Description
+Parameter | Data type | Description
:--- | :--- | :---
`_name` | String | The name of the filter. Optional.
`distance` | String | The distance within which to match the points. This distance is the radius of a circle centered at the specified point. For supported distance units, see [Distance units]({{site.url}}{{site.baseurl}}/api-reference/common-parameters/#distance-units). Required.
diff --git a/_query-dsl/geo-and-xy/geopolygon.md b/_query-dsl/geo-and-xy/geopolygon.md
index 980a0c5a63..810e48f2b7 100644
--- a/_query-dsl/geo-and-xy/geopolygon.md
+++ b/_query-dsl/geo-and-xy/geopolygon.md
@@ -161,11 +161,11 @@ However, if you specify the vertices in the following order:
The response returns no results.
-## Request fields
+## Parameters
-Geopolygon queries accept the following fields.
+Geopolygon queries accept the following parameters.
-Field | Data type | Description
+Parameter | Data type | Description
:--- | :--- | :---
`_name` | String | The name of the filter. Optional.
`validation_method` | String | The validation method. Valid values are `IGNORE_MALFORMED` (accept geopoints with invalid coordinates), `COERCE` (try to coerce coordinates to valid values), and `STRICT` (return an error when coordinates are invalid). Optional. Default is `STRICT`.
diff --git a/_query-dsl/geo-and-xy/geoshape.md b/_query-dsl/geo-and-xy/geoshape.md
index 42948666f4..5b144b06d6 100644
--- a/_query-dsl/geo-and-xy/geoshape.md
+++ b/_query-dsl/geo-and-xy/geoshape.md
@@ -25,15 +25,15 @@ Relation | Description | Supporting geographic field type
## Defining the shape in a geoshape query
-You can define the shape to filter documents in a geoshape query either by providing a new shape definition at query time or by referencing the name of a shape pre-indexed in another index.
+You can define the shape to filter documents in a geoshape query either by [providing a new shape definition at query time](#using-a-new-shape-definition) or by [referencing the name of a shape pre-indexed in another index](#using-a-pre-indexed-shape-definition).
-### Using a new shape definition
+## Using a new shape definition
To provide a new shape to a geoshape query, define it in the `geo_shape` field. You must define the geoshape in [GeoJSON format](https://geojson.org/).
The following example illustrates searching for documents containing geoshapes that match a geoshape defined at query time.
-#### Step 1: Create an index
+### Step 1: Create an index
First, create an index and map the `location` field as a `geo_shape`:
@@ -422,7 +422,7 @@ GET /testindex/_search
Geoshape queries whose geometry collection contains a linestring or a multilinestring do not support the `WITHIN` relation.
{: .note}
-### Using a pre-indexed shape definition
+## Using a pre-indexed shape definition
When constructing a geoshape query, you can also reference the name of a shape pre-indexed in another index. Using this method, you can define a geoshape at index time and refer to it by name at search time.
@@ -721,10 +721,10 @@ The response returns document 1:
Note that when you indexed the geopoints, you specified their coordinates in `"latitude, longitude"` format. When you search for matching documents, the coordinate array is in `[longitude, latitude]` format. Thus, document 1 is returned in the results but document 2 is not.
-## Request fields
+## Parameters
-Geoshape queries accept the following fields.
+Geoshape queries accept the following parameters.
-Field | Data type | Description
+Parameter | Data type | Description
:--- | :--- | :---
`ignore_unmapped` | Boolean | Specifies whether to ignore an unmapped field. If set to `true`, then the query does not return any documents that contain an unmapped field. If set to `false`, then an exception is thrown when the field is unmapped. Optional. Default is `false`.
\ No newline at end of file
diff --git a/_query-dsl/joining/has-child.md b/_query-dsl/joining/has-child.md
new file mode 100644
index 0000000000..c7da5bf7a9
--- /dev/null
+++ b/_query-dsl/joining/has-child.md
@@ -0,0 +1,398 @@
+---
+layout: default
+title: Has child
+parent: Joining queries
+nav_order: 10
+---
+
+# Has child query
+
+The `has_child` query returns parent documents whose child documents match a specific query. You can establish parent/child relationships between documents in the same index by using a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field type.
+
+The `has_child` query is slower than other queries because of the join operation it performs. Performance decreases as the number of matching child documents pointing to different parent documents increases. Each `has_child` query in your search may significantly impact query performance. If you prioritize speed, avoid using this query or limit its usage as much as possible.
+{: .warning}
+
+## Example
+
+Before you can run a `has_child` query, your index must contain a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field in order to establish parent/child relationships. The index mapping request uses the following format:
+
+```json
+PUT /example_index
+{
+ "mappings": {
+ "properties": {
+ "relationship_field": {
+ "type": "join",
+ "relations": {
+ "parent_doc": "child_doc"
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+In this example, you'll configure an index that contains documents representing products and their brands.
+
+First, create the index and establish the parent/child relationship between `brand` and `product`:
+
+```json
+PUT testindex1
+{
+ "mappings": {
+ "properties": {
+ "product_to_brand": {
+ "type": "join",
+ "relations": {
+ "brand": "product"
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+Index two parent (brand) documents:
+
+```json
+PUT testindex1/_doc/1
+{
+ "name": "Luxury brand",
+ "product_to_brand" : "brand"
+}
+```
+{% include copy-curl.html %}
+
+```json
+PUT testindex1/_doc/2
+{
+ "name": "Economy brand",
+ "product_to_brand" : "brand"
+}
+```
+{% include copy-curl.html %}
+
+Index three child (product) documents:
+
+```json
+PUT testindex1/_doc/3?routing=1
+{
+ "name": "Mechanical watch",
+ "sales_count": 150,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "1"
+ }
+}
+```
+{% include copy-curl.html %}
+
+```json
+PUT testindex1/_doc/4?routing=2
+{
+ "name": "Electronic watch",
+ "sales_count": 300,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+}
+```
+{% include copy-curl.html %}
+
+```json
+PUT testindex1/_doc/5?routing=2
+{
+ "name": "Digital watch",
+ "sales_count": 100,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+}
+```
+{% include copy-curl.html %}
+
+To search for the parent of a child, use a `has_child` query. The following query returns parent documents (brands) that make watches:
+
+```json
+GET testindex1/_search
+{
+ "query" : {
+ "has_child": {
+ "type":"product",
+ "query": {
+ "match" : {
+ "name": "watch"
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response returns both brands:
+
+```json
+{
+ "took": 15,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 2,
+ "relation": "eq"
+ },
+ "max_score": 1,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "1",
+ "_score": 1,
+ "_source": {
+ "name": "Luxury brand",
+ "product_to_brand": "brand"
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "2",
+ "_score": 1,
+ "_source": {
+ "name": "Economy brand",
+ "product_to_brand": "brand"
+ }
+ }
+ ]
+ }
+}
+```
+
+## Retrieving inner hits
+
+To return child documents that matched the query, provide the `inner_hits` parameter:
+
+```json
+GET testindex1/_search
+{
+ "query" : {
+ "has_child": {
+ "type":"product",
+ "query": {
+ "match" : {
+ "name": "watch"
+ }
+ },
+ "inner_hits": {}
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response contains child documents in the `inner_hits` field:
+
+```json
+{
+ "took": 52,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 2,
+ "relation": "eq"
+ },
+ "max_score": 1,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "1",
+ "_score": 1,
+ "_source": {
+ "name": "Luxury brand",
+ "product_to_brand": "brand"
+ },
+ "inner_hits": {
+ "product": {
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 0.53899646,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "3",
+ "_score": 0.53899646,
+ "_routing": "1",
+ "_source": {
+ "name": "Mechanical watch",
+ "sales_count": 150,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "1"
+ }
+ }
+ }
+ ]
+ }
+ }
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "2",
+ "_score": 1,
+ "_source": {
+ "name": "Economy brand",
+ "product_to_brand": "brand"
+ },
+ "inner_hits": {
+ "product": {
+ "hits": {
+ "total": {
+ "value": 2,
+ "relation": "eq"
+ },
+ "max_score": 0.53899646,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "4",
+ "_score": 0.53899646,
+ "_routing": "2",
+ "_source": {
+ "name": "Electronic watch",
+ "sales_count": 300,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "5",
+ "_score": 0.53899646,
+ "_routing": "2",
+ "_source": {
+ "name": "Digital watch",
+ "sales_count": 100,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ }
+ }
+ ]
+ }
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+For more information about retrieving inner hits, see [Inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/).
+
+## Parameters
+
+The following table lists all top-level parameters supported by `has_child` queries.
+
+| Parameter | Required/Optional | Description |
+|:---|:---|:---|
+| `type` | Required | Specifies the name of the child relationship as defined in the `join` field mapping. |
+| `query` | Required | The query to run on child documents. If a child document matches the query, the parent document is returned. |
+| `ignore_unmapped` | Optional | Indicates whether to ignore unmapped `type` fields and not return documents instead of throwing an error. You can provide this parameter when querying multiple indexes, some of which may not contain the `type` field. Default is `false`. |
+| `max_children` | Optional | The maximum number of matching child documents for a parent document. If exceeded, the parent document is excluded from the search results. |
+| `min_children` | Optional | The minimum number of matching child documents required for a parent document to be included in the results. If not met, the parent is excluded. Default is `1`.|
+| `score_mode` | Optional | Defines how the scores of matching child documents influence the parent document's score. Valid values are: <br> - `none`: Ignores the relevance scores of child documents and assigns a score of `0` to the parent document. <br> - `avg`: Uses the average relevance score of all matching child documents. <br> - `max`: Assigns the highest relevance score from the matching child documents to the parent. <br> - `min`: Assigns the lowest relevance score from the matching child documents to the parent. <br> - `sum`: Sums the relevance scores of all matching child documents. <br> Default is `none`. |
+| `inner_hits` | Optional | If provided, returns the underlying hits (child documents) that matched the query. |
+
+
+## Sorting limitations
+
+The `has_child` query does not support [sorting results]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/sort/) using standard sorting options. If you need to sort parent documents by fields in their child documents, you can use a [`function_score` query]({{site.url}}{{site.baseurl}}/query-dsl/compound/function-score/) and sort by the parent document's score.
+
+In the preceding example, you can sort parent documents (brands) based on the `sales_count` of their child products. This query multiplies the score by the `sales_count` field of the child documents and assigns the highest relevance score from the matching child documents to the parent:
+
+```json
+GET testindex1/_search
+{
+ "query": {
+ "has_child": {
+ "type": "product",
+ "query": {
+ "function_score": {
+ "script_score": {
+ "script": "_score * doc['sales_count'].value"
+ }
+ }
+ },
+ "score_mode": "max"
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response contains the brands sorted by the highest child `sales_count`:
+
+```json
+{
+ "took": 6,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 2,
+ "relation": "eq"
+ },
+ "max_score": 300,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "2",
+ "_score": 300,
+ "_source": {
+ "name": "Economy brand",
+ "product_to_brand": "brand"
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "1",
+ "_score": 150,
+ "_source": {
+ "name": "Luxury brand",
+ "product_to_brand": "brand"
+ }
+ }
+ ]
+ }
+}
+```
+
+## Next steps
+
+- Learn more about [retrieving inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/).
\ No newline at end of file
diff --git a/_query-dsl/joining/has-parent.md b/_query-dsl/joining/has-parent.md
new file mode 100644
index 0000000000..6b293ffff2
--- /dev/null
+++ b/_query-dsl/joining/has-parent.md
@@ -0,0 +1,358 @@
+---
+layout: default
+title: Has parent
+parent: Joining queries
+nav_order: 20
+---
+
+# Has parent query
+
+The `has_parent` query returns child documents whose parent documents match a specific query. You can establish parent/child relationships between documents in the same index by using a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field type.
+
+The `has_parent` query is slower than other queries because of the join operation it performs. Performance decreases as the number of matching parent documents increases. Each `has_parent` query in your search may significantly impact query performance. If you prioritize speed, avoid using this query or limit its usage as much as possible.
+{: .warning}
+
+## Example
+
+Before you can run a `has_parent` query, your index must contain a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field in order to establish parent/child relationships. The index mapping request uses the following format:
+
+```json
+PUT /example_index
+{
+ "mappings": {
+ "properties": {
+ "relationship_field": {
+ "type": "join",
+ "relations": {
+ "parent_doc": "child_doc"
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+For this example, first configure an index that contains documents representing products and their brands as described in the [`has_child` query example]({{site.url}}{{site.baseurl}}/query-dsl/joining/has-child/).
+
+To search for the child of a parent, use a `has_parent` query. The following query returns child documents (products) made by the brand matching the query `economy`:
+
+```json
+GET testindex1/_search
+{
+ "query" : {
+ "has_parent": {
+ "parent_type":"brand",
+ "query": {
+ "match" : {
+ "name": "economy"
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response returns all products made by the brand:
+
+```json
+{
+ "took": 11,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 2,
+ "relation": "eq"
+ },
+ "max_score": 1,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "4",
+ "_score": 1,
+ "_routing": "2",
+ "_source": {
+ "name": "Electronic watch",
+ "sales_count": 300,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "5",
+ "_score": 1,
+ "_routing": "2",
+ "_source": {
+ "name": "Digital watch",
+ "sales_count": 100,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+## Retrieving inner hits
+
+To return parent documents that matched the query, provide the `inner_hits` parameter:
+
+```json
+GET testindex1/_search
+{
+ "query" : {
+ "has_parent": {
+ "parent_type":"brand",
+ "query": {
+ "match" : {
+ "name": "economy"
+ }
+ },
+ "inner_hits": {}
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response contains parent documents in the `inner_hits` field:
+
+```json
+{
+ "took": 11,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 2,
+ "relation": "eq"
+ },
+ "max_score": 1,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "4",
+ "_score": 1,
+ "_routing": "2",
+ "_source": {
+ "name": "Electronic watch",
+ "sales_count": 300,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ },
+ "inner_hits": {
+ "brand": {
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 1.3862942,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "2",
+ "_score": 1.3862942,
+ "_source": {
+ "name": "Economy brand",
+ "product_to_brand": "brand"
+ }
+ }
+ ]
+ }
+ }
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "5",
+ "_score": 1,
+ "_routing": "2",
+ "_source": {
+ "name": "Digital watch",
+ "sales_count": 100,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ },
+ "inner_hits": {
+ "brand": {
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 1.3862942,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "2",
+ "_score": 1.3862942,
+ "_source": {
+ "name": "Economy brand",
+ "product_to_brand": "brand"
+ }
+ }
+ ]
+ }
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+For more information about retrieving inner hits, see [Inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/).
+
+## Parameters
+
+The following table lists all top-level parameters supported by `has_parent` queries.
+
+| Parameter | Required/Optional | Description |
+|:---|:---|:---|
+| `parent_type` | Required | Specifies the name of the parent relationship as defined in the `join` field mapping. |
+| `query` | Required | The query to run on parent documents. If a parent document matches the query, the child document is returned. |
+| `ignore_unmapped` | Optional | Indicates whether to ignore unmapped `parent_type` fields and not return documents instead of throwing an error. You can provide this parameter when querying multiple indexes, some of which may not contain the `parent_type` field. Default is `false`. |
+| `score` | Optional | Indicates whether the relevance score of a matching parent document is aggregated into its child documents. If `false`, then the relevance score of the parent document is ignored, and each child document is assigned a relevance score equal to the query's `boost`, which defaults to `1`. If `true`, then the relevance score of the matching parent document is aggregated into the relevance scores of its child documents. Default is `false`. |
+| `inner_hits` | Optional | If provided, returns the underlying hits (parent documents) that matched the query. |
+
+
+## Sorting limitations
+
+The `has_parent` query does not support [sorting results]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/sort/) using standard sorting options. If you need to sort child documents by fields in their parent documents, you can use a [`function_score` query]({{site.url}}{{site.baseurl}}/query-dsl/compound/function-score/) and sort by the child document's score.
+
+For the preceding example, first add a `customer_satisfaction` field by which you'll sort the child documents belonging to the parent (brand) documents:
+
+```json
+PUT testindex1/_doc/1
+{
+ "name": "Luxury watch brand",
+ "product_to_brand" : "brand",
+ "customer_satisfaction": 4.5
+}
+```
+{% include copy-curl.html %}
+
+```json
+PUT testindex1/_doc/2
+{
+ "name": "Economy watch brand",
+ "product_to_brand" : "brand",
+ "customer_satisfaction": 3.9
+}
+```
+{% include copy-curl.html %}
+
+Now you can sort child documents (products) based on the `customer_satisfaction` field of their parent brands. This query multiplies the score by the `customer_satisfaction` field of the parent documents:
+
+```json
+GET testindex1/_search
+{
+ "query": {
+ "has_parent": {
+ "parent_type": "brand",
+ "score": true,
+ "query": {
+ "function_score": {
+ "script_score": {
+ "script": "_score * doc['customer_satisfaction'].value"
+ }
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response contains the products, sorted by the highest parent `customer_satisfaction`:
+
+```json
+{
+ "took": 11,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 3,
+ "relation": "eq"
+ },
+ "max_score": 4.5,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "3",
+ "_score": 4.5,
+ "_routing": "1",
+ "_source": {
+ "name": "Mechanical watch",
+ "sales_count": 150,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "1"
+ }
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "4",
+ "_score": 3.9,
+ "_routing": "2",
+ "_source": {
+ "name": "Electronic watch",
+ "sales_count": 300,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ }
+ },
+ {
+ "_index": "testindex1",
+ "_id": "5",
+ "_score": 3.9,
+ "_routing": "2",
+ "_source": {
+ "name": "Digital watch",
+ "sales_count": 100,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "2"
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+## Next steps
+
+- Learn more about [retrieving inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/).
\ No newline at end of file
diff --git a/_query-dsl/joining/index.md b/_query-dsl/joining/index.md
index 20f48c0b16..f0a0060640 100644
--- a/_query-dsl/joining/index.md
+++ b/_query-dsl/joining/index.md
@@ -3,16 +3,22 @@ layout: default
title: Joining queries
has_children: true
nav_order: 55
+has_toc: false
+redirect_from:
+ - /query-dsl/joining/
---
# Joining queries
OpenSearch is a distributed system in which data is spread across multiple nodes. Thus, running a SQL-like JOIN operation in OpenSearch is resource intensive. As an alternative, OpenSearch provides the following queries that perform join operations and are optimized for scaling across multiple nodes:
-- `nested` queries: Act as wrappers for other queries to search [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) fields. The nested field objects are searched as though they were indexed as separate documents.
-- `has_child` queries: Search for parent documents whose child documents match the query.
-- `has_parent` queries: Search for child documents whose parent documents match the query.
-- `parent_id` queries: A [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) field type establishes a parent/child relationship between documents in the same index. `parent_id` queries search for child documents that are joined to a specific parent document.
+
+- Queries for searching [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) fields:
+ - `nested` queries: Act as wrappers for other queries to search [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) fields. The nested field objects are searched as though they were indexed as separate documents.
+- Queries for searching documents connected by a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field type, which establishes a parent/child relationship between documents in the same index:
+ - [`has_child`]({{site.url}}{{site.baseurl}}/query-dsl/joining/has-child/) queries: Search for parent documents whose child documents match the query.
+ - [`has_parent`]({{site.url}}{{site.baseurl}}/query-dsl/joining/has-parent/) queries: Search for child documents whose parent documents match the query.
+ - [`parent_id`]({{site.url}}{{site.baseurl}}/query-dsl/joining/parent-id/) queries: Search for child documents that are joined to a specific parent document.
If [`search.allow_expensive_queries`]({{site.url}}{{site.baseurl}}/query-dsl/index/#expensive-queries) is set to `false`, then joining queries are not executed.
{: .important}
\ No newline at end of file
diff --git a/_query-dsl/joining/nested.md b/_query-dsl/joining/nested.md
new file mode 100644
index 0000000000..431a40ed1a
--- /dev/null
+++ b/_query-dsl/joining/nested.md
@@ -0,0 +1,347 @@
+---
+layout: default
+title: Nested
+parent: Joining queries
+nav_order: 30
+---
+
+# Nested query
+
+The `nested` query acts as a wrapper for other queries to search [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) fields. The nested field objects are searched as though they were indexed as separate documents. If an object matches the search, the `nested` query returns the parent document at the root level.
+
+## Example
+
+Before you can run a `nested` query, your index must contain a [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) field.
+
+To configure an example index containing nested fields, send the following request:
+
+```json
+PUT /testindex
+{
+ "mappings": {
+ "properties": {
+ "patient": {
+ "type": "nested",
+ "properties": {
+ "name": {
+ "type": "text"
+ },
+ "age": {
+ "type": "integer"
+ }
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+Next, index a document into the example index:
+
+```json
+PUT /testindex/_doc/1
+{
+ "patient": {
+ "name": "John Doe",
+ "age": 56
+ }
+}
+```
+{% include copy-curl.html %}
+
+To search the nested `patient` field, wrap your query in a `nested` query and provide the `path` to the nested field:
+
+```json
+GET /testindex/_search
+{
+ "query": {
+ "nested": {
+ "path": "patient",
+ "query": {
+ "match": {
+ "patient.name": "John"
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The query returns the matching document:
+
+```json
+{
+ "took": 3,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 0.2876821,
+ "hits": [
+ {
+ "_index": "testindex",
+ "_id": "1",
+ "_score": 0.2876821,
+ "_source": {
+ "patient": {
+ "name": "John Doe",
+ "age": 56
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+## Retrieving inner hits
+
+To return inner hits that matched the query, provide the `inner_hits` parameter:
+
+```json
+GET /testindex/_search
+{
+ "query": {
+ "nested": {
+ "path": "patient",
+ "query": {
+ "match": {
+ "patient.name": "John"
+ }
+ },
+ "inner_hits": {}
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response contains the additional `inner_hits` field. The `_nested` field identifies the specific inner object from which the inner hit originated. It contains the nested hit and the offset relative to its position in the `_source`. Because of sorting and scoring, the position of the hit objects in `inner_hits` often differs from their original location in the nested object.
+
+By default, the `_source` of the hit objects within `inner_hits` is returned relative to the `_nested` field. In this example, the `_source` within `inner_hits` contains the `name` and `age` fields as opposed to the top-level `_source`, which contains the whole `patient` object:
+
+```json
+{
+ "took": 38,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 0.2876821,
+ "hits": [
+ {
+ "_index": "testindex",
+ "_id": "1",
+ "_score": 0.2876821,
+ "_source": {
+ "patient": {
+ "name": "John Doe",
+ "age": 56
+ }
+ },
+ "inner_hits": {
+ "patient": {
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 0.2876821,
+ "hits": [
+ {
+ "_index": "testindex",
+ "_id": "1",
+ "_nested": {
+ "field": "patient",
+ "offset": 0
+ },
+ "_score": 0.2876821,
+ "_source": {
+ "name": "John Doe",
+ "age": 56
+ }
+ }
+ ]
+ }
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+You can disable returning `_source` by configuring the `_source` field in the mappings. For more information, see [Source]({{site.url}}{{site.baseurl}}/field-types/metadata-fields/source/).
+{: .tip}
+
+For more information about retrieving inner hits, see [Inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/).
+
+## Multi-level nested queries
+
+You can search documents that have nested objects inside other nested objects using multi-level nested queries. In this example, you'll query multiple layers of nested fields by specifying a nested query for each level of the hierarchy.
+
+First, create an index with multi-level nested fields:
+
+```json
+PUT /patients
+{
+ "mappings": {
+ "properties": {
+ "patient": {
+ "type": "nested",
+ "properties": {
+ "name": {
+ "type": "text"
+ },
+ "contacts": {
+ "type": "nested",
+ "properties": {
+ "name": {
+ "type": "text"
+ },
+ "relationship": {
+ "type": "text"
+ },
+ "phone": {
+ "type": "keyword"
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+Next, index a document into the example index:
+
+```json
+PUT /patients/_doc/1
+{
+ "patient": {
+ "name": "John Doe",
+ "contacts": [
+ {
+ "name": "Jane Doe",
+ "relationship": "mother",
+ "phone": "5551111"
+ },
+ {
+ "name": "Joe Doe",
+ "relationship": "father",
+ "phone": "5552222"
+ }
+ ]
+ }
+}
+```
+{% include copy-curl.html %}
+
+To search the nested `patient` field, use a multi-level `nested` query. The following query searches for patients whose contact information includes a person named `Jane` with a relationship of `mother`:
+
+```json
+GET /patients/_search
+{
+ "query": {
+ "nested": {
+ "path": "patient",
+ "query": {
+ "nested": {
+ "path": "patient.contacts",
+ "query": {
+ "bool": {
+ "must": [
+ { "match": { "patient.contacts.relationship": "mother" } },
+ { "match": { "patient.contacts.name": "Jane" } }
+ ]
+ }
+ }
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The query returns the patient who has a contact entry matching these details:
+
+```json
+{
+ "took": 14,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 1.3862942,
+ "hits": [
+ {
+ "_index": "patients",
+ "_id": "1",
+ "_score": 1.3862942,
+ "_source": {
+ "patient": {
+ "name": "John Doe",
+ "contacts": [
+ {
+ "name": "Jane Doe",
+ "relationship": "mother",
+ "phone": "5551111"
+ },
+ {
+ "name": "Joe Doe",
+ "relationship": "father",
+ "phone": "5552222"
+ }
+ ]
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+## Parameters
+
+The following table lists all top-level parameters supported by `nested` queries.
+
+| Parameter | Required/Optional | Description |
+|:---|:---|:---|
+| `path` | Required | Specifies the path to the nested object that you want to search. |
+| `query` | Required | The query to run on the nested objects within the specified `path`. If a nested object matches the query, the root parent document is returned. You can search nested fields using dot notation, such as `nested_object.subfield`. Multi-level nesting is supported and automatically detected. Thus, an inner `nested` query within another nested query automatically matches the correct nesting level, instead of the root. |
+| `ignore_unmapped` | Optional | Indicates whether to ignore unmapped `path` fields and not return documents instead of throwing an error. You can provide this parameter when querying multiple indexes, some of which may not contain the `path` field. Default is `false`. |
+| `score_mode` | Optional | Defines how scores of matching inner documents influence the parent document's score. Valid values are: <br> - `avg`: Uses the average relevance score of all matching inner documents. <br> - `max`: Assigns the highest relevance score from the matching inner documents to the parent. <br> - `min`: Assigns the lowest relevance score from the matching inner documents to the parent. <br> - `sum`: Sums the relevance scores of all matching inner documents. <br> - `none`: Ignores the relevance scores of inner documents and assigns a score of `0` to the parent document. <br> Default is `avg`. |
+| `inner_hits` | Optional | If provided, returns the underlying hits that matched the query. |
+
+## Next steps
+
+- Learn more about [retrieving inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/).
\ No newline at end of file
diff --git a/_query-dsl/joining/parent-id.md b/_query-dsl/joining/parent-id.md
new file mode 100644
index 0000000000..cbf86a796e
--- /dev/null
+++ b/_query-dsl/joining/parent-id.md
@@ -0,0 +1,96 @@
+---
+layout: default
+title: Parent ID
+parent: Joining queries
+nav_order: 40
+---
+
+# Parent ID query
+
+The `parent_id` query returns child documents whose parent document has the specified ID. You can establish parent/child relationships between documents in the same index by using a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field type.
+
+## Example
+
+Before you can run a `parent_id` query, your index must contain a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field in order to establish parent/child relationships. The index mapping request uses the following format:
+
+```json
+PUT /example_index
+{
+ "mappings": {
+ "properties": {
+ "relationship_field": {
+ "type": "join",
+ "relations": {
+ "parent_doc": "child_doc"
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+For this example, first configure an index that contains documents representing products and their brands as described in the [`has_child` query example]({{site.url}}{{site.baseurl}}/query-dsl/joining/has-child/).
+
+To search for child documents of a specific parent document, use a `parent_id` query. The following query returns child documents (products) whose parent document has the ID `1`:
+
+```json
+GET testindex1/_search
+{
+ "query": {
+ "parent_id": {
+ "type": "product",
+ "id": "1"
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The response returns the child product:
+
+```json
+{
+ "took": 57,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 1,
+ "relation": "eq"
+ },
+ "max_score": 0.87546873,
+ "hits": [
+ {
+ "_index": "testindex1",
+ "_id": "3",
+ "_score": 0.87546873,
+ "_routing": "1",
+ "_source": {
+ "name": "Mechanical watch",
+ "sales_count": 150,
+ "product_to_brand": {
+ "name": "product",
+ "parent": "1"
+ }
+ }
+ }
+ ]
+ }
+}
+```
+
+## Parameters
+
+The following table lists all top-level parameters supported by `parent_id` queries.
+
+| Parameter | Required/Optional | Description |
+|:---|:---|:---|
+| `type` | Required | Specifies the name of the child relationship as defined in the `join` field mapping. |
+| `id` | Required | The ID of the parent document. The query returns child documents associated with this parent document. |
+| `ignore_unmapped` | Optional | Indicates whether to ignore unmapped `type` fields and not return documents instead of throwing an error. You can provide this parameter when querying multiple indexes, some of which may not contain the `type` field. Default is `false`. |
\ No newline at end of file
diff --git a/_query-dsl/specialized/neural.md b/_query-dsl/specialized/neural.md
index 14b930cdb6..6cd534b87f 100644
--- a/_query-dsl/specialized/neural.md
+++ b/_query-dsl/specialized/neural.md
@@ -35,6 +35,8 @@ Field | Data type | Required/Optional | Description
`min_score` | Float | Optional | The minimum score threshold for the search results. Only one variable, either `k`, `min_score`, or `max_distance`, can be specified. For more information, see [k-NN radial search]({{site.url}}{{site.baseurl}}/search-plugins/knn/radial-search-knn/).
`max_distance` | Float | Optional | The maximum distance threshold for the search results. Only one variable, either `k`, `min_score`, or `max_distance`, can be specified. For more information, see [k-NN radial search]({{site.url}}{{site.baseurl}}/search-plugins/knn/radial-search-knn/).
`filter` | Object | Optional | A query that can be used to reduce the number of documents considered. For more information about filter usage, see [k-NN search with filters]({{site.url}}{{site.baseurl}}/search-plugins/knn/filter-search-knn/). **Important**: Filter can only be used with the `faiss` or `lucene` engines.
+`method_parameters` | Object | Optional | Parameters passed to the k-NN index during search. See [Additional query parameters]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#additional-query-parameters).
+`rescore` | Object | Optional | Parameters for configuring rescoring functionality for k-NN indexes built using quantization. See [Rescoring]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#rescoring-quantized-results-using-full-precision).
#### Example request
diff --git a/_query-dsl/term/terms.md b/_query-dsl/term/terms.md
index 42c74c0436..7dac6a9619 100644
--- a/_query-dsl/term/terms.md
+++ b/_query-dsl/term/terms.md
@@ -39,6 +39,7 @@ Parameter | Data type | Description
:--- | :--- | :---
`<field>` | String | The field in which to search. A document is returned in the results only if its field value exactly matches at least one term, with the correct spacing and capitalization.
`boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0.
+`value_type` | String | Specifies the types of values used for filtering. Valid values are `default` and `bitmap`. If omitted, the value defaults to `default`.
## Terms lookup
@@ -250,3 +251,136 @@ Parameter | Data type | Description
`path` | String | The name of the field from which to fetch field values. Specify nested fields using dot path notation. Required.
`routing` | String | Custom routing value of the document from which to fetch field values. Optional. Required if a custom routing value was provided when the document was indexed.
`boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0.
+
+## Bitmap filtering
+**Introduced 2.17**
+{: .label .label-purple }
+
+The `terms` query can filter for multiple terms simultaneously. However, when the number of terms in the input filter increases to a large value (around 10,000), the resulting network and memory overhead can become significant, making the query inefficient. In such cases, consider encoding your large terms filter using a [roaring bitmap](https://github.com/RoaringBitmap/RoaringBitmap) for more efficient filtering.
+
+The following example assumes that you have two indexes: a `products` index, which contains all the products sold by a company, and a `customers` index, which stores filters representing customers who own specific products.
+
+First, create a `products` index and map `product_id` as a `keyword`:
+
+```json
+PUT /products
+{
+ "mappings": {
+ "properties": {
+ "product_id": { "type": "keyword" }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+Next, index three documents that correspond to products:
+
+```json
+PUT products/_doc/1
+{
+ "name": "Product 1",
+ "product_id" : "111"
+}
+```
+{% include copy-curl.html %}
+
+```json
+PUT products/_doc/2
+{
+ "name": "Product 2",
+ "product_id" : "222"
+}
+```
+{% include copy-curl.html %}
+
+```json
+PUT products/_doc/3
+{
+ "name": "Product 3",
+ "product_id" : "333"
+}
+```
+{% include copy-curl.html %}
+
+To store customer bitmap filters, you'll create a `customer_filter` [binary field]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/binary/) in the `customers` index. Specify `store` as `true` to store the field:
+
+```json
+PUT /customers
+{
+ "mappings": {
+ "properties": {
+ "customer_filter": {
+ "type": "binary",
+ "store": true
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+For each customer, you need to generate a bitmap that represents the product IDs of the products the customer owns. This bitmap effectively encodes the filter criteria for that customer. In this example, you'll create a `terms` filter for a customer whose ID is `customer123` and who owns products `111`, `222`, and `333`.
+
+To encode a `terms` filter for the customer, first create a roaring bitmap for the filter. This example creates a bitmap using the [PyRoaringBitMap](https://github.com/Ezibenroc/PyRoaringBitMap) library, so first run `pip install pyroaring` to install the library. Then serialize the bitmap and encode it using a [Base64](https://en.wikipedia.org/wiki/Base64) encoding scheme:
+
+```py
+from pyroaring import BitMap
+import base64
+
+# Create a bitmap, serialize it into a byte string, and encode into Base64
+bm = BitMap([111, 222, 333]) # product ids owned by a customer
+encoded = base64.b64encode(BitMap.serialize(bm))
+
+# Convert the Base64-encoded bytes to a string for storage or transmission
+encoded_bm_str = encoded.decode('utf-8')
+
+# Print the encoded bitmap
+print(f"Encoded Bitmap: {encoded_bm_str}")
+```
+{% include copy.html %}
+
+Next, index the customer filter into the `customers` index. The document ID for the filter is the same as the ID for the corresponding customer (in this example, `customer123`). The `customer_filter` field contains the bitmap you generated for this customer:
+
+```json
+POST customers/_doc/customer123
+{
+ "customer_filter": "OjAAAAEAAAAAAAIAEAAAAG8A3gBNAQ=="
+}
+```
+{% include copy-curl.html %}
+
+Now you can run a `terms` query on the `products` index to look up a specific customer in the `customers` index. Because you're looking up a stored field instead of `_source`, set `store` to `true`. In the `value_type` field, specify the data type of the `terms` input as `bitmap`:
+
+```json
+POST /products/_search
+{
+ "query": {
+ "terms": {
+ "product_id": {
+ "index": "customers",
+ "id": "customer123",
+ "path": "customer_filter",
+ "store": true
+ },
+ "value_type": "bitmap"
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+You can also directly pass the bitmap to the `terms` query. In this example, the `product_id` field contains the customer filter bitmap for the customer whose ID is `customer123`:
+
+```json
+POST /products/_search
+{
+ "query": {
+ "terms": {
+ "product_id": "OjAAAAEAAAAAAAIAEAAAAG8A3gBNAQ==",
+ "value_type": "bitmap"
+ }
+ }
+}
+```
+{% include copy-curl.html %}
\ No newline at end of file
diff --git a/_sass/color_schemes/odfe.scss b/_sass/color_schemes/odfe.scss
deleted file mode 100644
index f9b2ca02ba..0000000000
--- a/_sass/color_schemes/odfe.scss
+++ /dev/null
@@ -1,75 +0,0 @@
-//
-// Brand colors
-//
-
-$white: #FFFFFF;
-
-$grey-dk-300: #241F21; // Error
-$grey-dk-250: mix(white, $grey-dk-300, 12.5%);
-$grey-dk-200: mix(white, $grey-dk-300, 25%);
-$grey-dk-100: mix(white, $grey-dk-300, 50%);
-$grey-dk-000: mix(white, $grey-dk-300, 75%);
-
-$grey-lt-300: #DBDBDB; // Cloud
-$grey-lt-200: mix(white, $grey-lt-300, 25%);
-$grey-lt-100: mix(white, $grey-lt-300, 50%);
-$grey-lt-000: mix(white, $grey-lt-300, 75%);
-
-$blue-300: #00007C; // Meta
-$blue-200: mix(white, $blue-300, 25%);
-$blue-100: mix(white, $blue-300, 50%);
-$blue-000: mix(white, $blue-300, 75%);
-
-$purple-300: #9600FF; // Prpl
-$purple-200: mix(white, $purple-300, 25%);
-$purple-100: mix(white, $purple-300, 50%);
-$purple-000: mix(white, $purple-300, 75%);
-
-$green-300: #00671A; // Element
-$green-200: mix(white, $green-300, 25%);
-$green-100: mix(white, $green-300, 50%);
-$green-000: mix(white, $green-300, 75%);
-
-$yellow-300: #FFDF00; // Kan-Banana
-$yellow-200: mix(white, $yellow-300, 25%);
-$yellow-100: mix(white, $yellow-300, 50%);
-$yellow-000: mix(white, $yellow-300, 75%);
-
-$red-300: #BD145A; // Ruby
-$red-200: mix(white, $red-300, 25%);
-$red-100: mix(white, $red-300, 50%);
-$red-000: mix(white, $red-300, 75%);
-
-$blue-lt-300: #0000FF; // Cascade
-$blue-lt-200: mix(white, $blue-lt-300, 25%);
-$blue-lt-100: mix(white, $blue-lt-300, 50%);
-$blue-lt-000: mix(white, $blue-lt-300, 75%);
-
-/*
-Other, unused brand colors
-
-Float #2797F4
-Firewall #0FF006B
-Hyper Pink #F261A1
-Cluster #ED20EB
-Back End #808080
-Python #25EE5C
-Warm Node #FEA501
-*/
-
-$body-background-color: $white;
-$sidebar-color: $grey-lt-000;
-$code-background-color: $grey-lt-000;
-
-$body-text-color: $grey-dk-200;
-$body-heading-color: $grey-dk-300;
-$nav-child-link-color: $grey-dk-200;
-$link-color: mix(black, $blue-lt-300, 37.5%);
-$btn-primary-color: $purple-300;
-$base-button-color: $grey-lt-000;
-
-// $border-color: $grey-dk-200;
-// $search-result-preview-color: $grey-dk-000;
-// $search-background-color: $grey-dk-250;
-// $table-background-color: $grey-dk-250;
-// $feedback-color: darken($sidebar-color, 3%);
diff --git a/_sass/custom/custom.scss b/_sass/custom/custom.scss
index 3a9dcc5e6d..b3ee3c3775 100755
--- a/_sass/custom/custom.scss
+++ b/_sass/custom/custom.scss
@@ -1039,14 +1039,25 @@ body {
display: flex;
align-items: flex-start;
justify-content: center;
- gap: 20px;
- margin: 0 auto;
+ gap: 0;
+ border-top: 1px solid #eeebee;
+ flex-direction: column;
+ @include mq(md) {
+ flex-direction: row;
+ gap: 20px
+ }
}
.search-page--sidebar {
- flex: 1;
- max-width: 200px;
- flex: 0 0 200px;
+ max-width: 100%;
+ order: 2;
+ margin-top: 1rem;
+ color: $grey-dk-300;
+ @include mq(md) {
+ flex: 1;
+ max-width: 200px;
+ margin-top: 3rem;
+ }
}
.search-page--sidebar--category-filter--checkbox-child {
@@ -1054,52 +1065,96 @@ body {
}
.search-page--results {
- flex: 3;
display: flex;
flex-direction: column;
align-items: center;
- max-width: 60%;
+ width: 100%;
+ max-width: 100%;
+ order: 3;
+ @include mq(md) {
+ flex: 3;
+ max-width: 60%;
+ }
}
-.search-page--results--input {
- width: 100%;
+.search-page--results--wrapper {
position: relative;
+ display: flex;
+ width: 100%;
+ background-color: white;
+ margin: 0 auto 2rem;
+ max-width: 800px;
}
.search-page--results--input-box {
width: 100%;
- padding: 10px;
- margin-bottom: 20px;
- border: 1px solid #ccc;
+ padding: 10px 40px 10px 10px;
+ border: 1px solid $grey-lt-300;
border-radius: 4px;
+ color: $grey-dk-300;
}
.search-page--results--input-icon {
position: absolute;
- top: 35%;
- right: 10px;
- transform: translateY(-50%);
+ right: 12px;
+ align-self: center;
pointer-events: none;
- color: #333;
+ color: $grey-dk-000;
}
-.search-page--results--diplay {
+.search-page--results--display {
width: 100%;
position: relative;
flex-flow: column nowrap;
+ margin-top: 1rem;
+ @media (max-width: $content-width) {
+ margin-top: 0.5rem;
+ }
}
-.search-page--results--diplay--header {
+.search-page--results--display--header {
text-align: center;
- margin-bottom: 20px;
background-color: transparent;
+ color: $grey-dk-300;
+ margin-bottom: 1rem;
+ margin-top: 1.5rem;
+ padding-bottom: 1rem;
+ border-bottom: 1px solid $blue-dk-100;
+ font-size: 20px;
+ @include mq(md) {
+ font-size: 1.5rem;
+ }
}
-.search-page--results--diplay--container--item {
- margin-bottom: 1%;
+.search-page--results--display--container--item {
+ margin-bottom: 2rem;
display: block;
}
+.search-page--results--no-results {
+ padding: 1rem;
+ display: block;
+ font-size: 1rem;
+ font-weight: normal;
+}
+
+.search-page--results--display--container--item--link {
+ font-family: "Open Sans Condensed", Impact, "Franklin Gothic Bold", sans-serif;
+ font-size: 1.2rem;
+ font-weight: bold;
+ display: block;
+ text-decoration: underline;
+ text-underline-offset: 5px;
+ text-decoration-color: $grey-lt-300;
+ &:hover {
+ text-decoration-color: $blue-100;
+ }
+}
+
+.category-checkbox {
+ margin-right: 4px;
+}
+
@mixin body-text($color: #000) {
color: $color;
font-family: 'Open Sans';
diff --git a/_search-plugins/collapse-search.md b/_search-plugins/collapse-search.md
new file mode 100644
index 0000000000..ec7e57515a
--- /dev/null
+++ b/_search-plugins/collapse-search.md
@@ -0,0 +1,231 @@
+---
+layout: default
+title: Collapse search results
+nav_order: 3
+---
+
+# Collapse search results
+
+The `collapse` parameter groups search results by a particular field value. This returns only the top document within each group, which helps reduce redundancy by eliminating duplicates.
+
+The `collapse` parameter requires the field being collapsed to be of either a `keyword` or a `numeric` type.
+
+---
+
+## Collapsing search results
+
+To populate an index with data, define the index mappings and an `item` field indexed as a `keyword`. The following example request shows you how to define index mappings, populate an index, and then search it.
+
+#### Define index mappings
+
+```json
+PUT /bakery-items
+{
+ "mappings": {
+ "properties": {
+ "item": {
+ "type": "keyword"
+ },
+ "category": {
+ "type": "keyword"
+ },
+ "price": {
+ "type": "float"
+ },
+ "baked_date": {
+ "type": "date"
+ }
+ }
+ }
+}
+```
+
+#### Populate an index
+
+```json
+POST /bakery-items/_bulk
+{ "index": {} }
+{ "item": "Chocolate Cake", "category": "cakes", "price": 15, "baked_date": "2023-07-01T00:00:00Z" }
+{ "index": {} }
+{ "item": "Chocolate Cake", "category": "cakes", "price": 18, "baked_date": "2023-07-04T00:00:00Z" }
+{ "index": {} }
+{ "item": "Vanilla Cake", "category": "cakes", "price": 12, "baked_date": "2023-07-02T00:00:00Z" }
+```
+
+#### Search the index, returning all results
+
+```json
+GET /bakery-items/_search
+{
+ "query": {
+ "match": {
+ "category": "cakes"
+ }
+ },
+ "sort": ["price"]
+}
+```
+
+This query returns the uncollapsed search results, showing all documents, including both entries for "Chocolate Cake".
+
+#### Search the index and collapse the results
+
+To group search results by the `item` field and sort them by `price`, you can use the following query:
+
+**Collapsed `item` field search results**
+
+```json
+GET /bakery-items/_search
+{
+ "query": {
+ "match": {
+ "category": "cakes"
+ }
+ },
+ "collapse": {
+ "field": "item"
+ },
+ "sort": ["price"]
+}
+```
+
+**Response**
+
+```json
+{
+ "took": 3,
+ "timed_out": false,
+ "_shards": {
+ "total": 1,
+ "successful": 1,
+ "skipped": 0,
+ "failed": 0
+ },
+ "hits": {
+ "total": {
+ "value": 4,
+ "relation": "eq"
+ },
+ "max_score": null,
+ "hits": [
+ {
+ "_index": "bakery-items",
+ "_id": "mISga5EB2HLDXHkv9kAr",
+ "_score": null,
+ "_source": {
+ "item": "Vanilla Cake",
+ "category": "cakes",
+ "price": 12,
+ "baked_date": "2023-07-02T00:00:00Z",
+ "baker": "Baker A"
+ },
+ "fields": {
+ "item": [
+ "Vanilla Cake"
+ ]
+ },
+ "sort": [
+ 12
+ ]
+ },
+ {
+ "_index": "bakery-items",
+ "_id": "loSga5EB2HLDXHkv9kAr",
+ "_score": null,
+ "_source": {
+ "item": "Chocolate Cake",
+ "category": "cakes",
+ "price": 15,
+ "baked_date": "2023-07-01T00:00:00Z",
+ "baker": "Baker A"
+ },
+ "fields": {
+ "item": [
+ "Chocolate Cake"
+ ]
+ },
+ "sort": [
+ 15
+ ]
+ }
+ ]
+ }
+}
+```
+
+The collapsed search results will show only one "Chocolate Cake" entry, demonstrating how the `collapse` parameter reduces redundancy.
+
+The `collapse` parameter affects only the top search results and does not change any aggregation results. The total number of hits shown in the response reflects all matching documents before the parameter is applied, including duplicates. However, the response doesn't indicate the exact number of unique groups formed by the operation.
+
+---
+
+## Expanding collapsed results
+
+You can expand each collapsed top hit with the `inner_hits` property.
+
+The following example request applies `inner_hits` to retrieve the lowest-priced and most recent item, for each type of cake:
+
+```json
+GET /bakery-items/_search
+{
+ "query": {
+ "match": {
+ "category": "cakes"
+ }
+ },
+ "collapse": {
+ "field": "item",
+ "inner_hits": [
+ {
+ "name": "cheapest_items",
+ "size": 1,
+ "sort": ["price"]
+ },
+ {
+ "name": "newest_items",
+ "size": 1,
+ "sort": [{ "baked_date": "desc" }]
+ }
+ ]
+ },
+ "sort": ["price"]
+}
+
+```
+
+### Multiple inner hits for each collapsed hit
+
+To obtain several groups of inner hits for each collapsed result, you can set different criteria for each group. For example, let's request the three most recent items for every bakery item:
+
+```json
+GET /bakery-items/_search
+{
+ "query": {
+ "match": {
+ "category": "cakes"
+ }
+ },
+ "collapse": {
+ "field": "item",
+ "inner_hits": [
+ {
+ "name": "cheapest_items",
+ "size": 1,
+ "sort": ["price"]
+ },
+ {
+ "name": "newest_items",
+ "size": 3,
+ "sort": [{ "baked_date": "desc" }]
+ }
+ ]
+ },
+ "sort": ["price"]
+}
+
+
+```
+This query searches for documents in the `cakes` category and groups the search results by the `item` field. For each `item`, it retrieves the single lowest-priced item and the three most recent items, sorted by `baked_date` in descending order.
+
+You can expand the groups by sending an additional query for each inner hit request corresponding to each collapsed hit in the response. This can significantly slow down the process if there are too many groups or inner hit requests. The `max_concurrent_group_searches` request parameter can be used to control the maximum number of concurrent searches allowed in this phase. The default is based on the number of data nodes and the default search thread pool size.
+
diff --git a/_search-plugins/concurrent-segment-search.md b/_search-plugins/concurrent-segment-search.md
index cbbb993ac9..80614e2fff 100644
--- a/_search-plugins/concurrent-segment-search.md
+++ b/_search-plugins/concurrent-segment-search.md
@@ -22,6 +22,8 @@ Without concurrent segment search, Lucene executes a request sequentially across
## Enabling concurrent segment search at the index or cluster level
+Starting with OpenSearch version 2.17, you can use the `search.concurrent_segment_search.mode` setting to configure concurrent segment search on your cluster. The existing `search.concurrent_segment_search.enabled` setting will be deprecated in future version releases in favor of the new setting.
+
By default, concurrent segment search is disabled on the cluster. You can enable concurrent segment search at two levels:
- Cluster level
@@ -30,8 +32,37 @@ By default, concurrent segment search is disabled on the cluster. You can enable
The index-level setting takes priority over the cluster-level setting. Thus, if the cluster setting is enabled but the index setting is disabled, then concurrent segment search will be disabled for that index. Because of this, the index-level setting is not evaluated unless it is explicitly set, regardless of the default value configured for the setting. You can retrieve the current value of the index-level setting by calling the [Index Settings API]({{site.url}}{{site.baseurl}}/api-reference/index-apis/get-settings/) and omitting the `?include_defaults` query parameter.
{: .note}
-To enable concurrent segment search for all indexes in the cluster, set the following dynamic cluster setting:
+Both the cluster- and index-level `search.concurrent_segment_search.mode` settings accept the following values:
+
+- `all`: Enables concurrent segment search across all search requests. This is equivalent to setting `search.concurrent_segment_search.enabled` to `true`.
+
+- `none`: Disables concurrent segment search for all search requests, effectively turning off the feature. This is equivalent to setting `search.concurrent_segment_search.enabled` to `false`. This is the **default** behavior.
+
+- `auto`: In this mode, OpenSearch will use the pluggable _concurrent search decider_ to decide whether to use a concurrent or sequential path for the search request based on the query evaluation and the presence of aggregations in the request. By default, if there are no deciders configured by any plugin, then the decision to use concurrent search will be made based on the presence of aggregations in the request. For more information about the pluggable decider semantics, see [Pluggable concurrent search deciders](#pluggable-concurrent-search-deciders-concurrentsearchrequestdecider).
+
+To enable concurrent segment search for all search requests across every index in the cluster, send the following request:
+```json
+PUT _cluster/settings
+{
+ "persistent":{
+ "search.concurrent_segment_search.mode": "all"
+ }
+}
+```
+{% include copy-curl.html %}
+
+To enable concurrent segment search for all search requests on a particular index, specify the index name in the endpoint:
+
+```json
+PUT /_settings
+{
+ "index.search.concurrent_segment_search.mode": "all"
+}
+```
+{% include copy-curl.html %}
+
+You can continue to use the existing `search.concurrent_segment_search.enabled` setting to enable concurrent segment search for all indexes in the cluster as follows:
```json
PUT _cluster/settings
{
@@ -52,6 +83,35 @@ PUT /_settings
```
{% include copy-curl.html %}
+
+When evaluating whether concurrent segment search is enabled on a cluster, the `search.concurrent_segment_search.mode` setting takes precedence over the `search.concurrent_segment_search.enabled` setting.
+If the `search.concurrent_segment_search.mode` setting is not explicitly set, then the `search.concurrent_segment_search.enabled` setting will be evaluated to determine whether to enable concurrent segment search.
+
+When upgrading a cluster from an earlier version that specifies the older `search.concurrent_segment_search.enabled` setting, this setting will continue to be honored. However, once the `search.concurrent_segment_search.mode` setting is configured, it will override the previous setting, enabling or disabling concurrent search based on the specified mode.
+We recommend setting `search.concurrent_segment_search.enabled` to `null` on your cluster once you configure `search.concurrent_segment_search.mode`:
+
+```json
+PUT _cluster/settings
+{
+ "persistent":{
+ "search.concurrent_segment_search.enabled": null
+ }
+}
+```
+{% include copy-curl.html %}
+
+To disable the old setting for a particular index, specify the index name in the endpoint:
+```json
+PUT /_settings
+{
+ "index.search.concurrent_segment_search.enabled": null
+}
+```
+{% include copy-curl.html %}
+
+
+
+
## Slicing mechanisms
You can choose one of two available mechanisms for assigning segments to slices: the default [Lucene mechanism](#the-lucene-mechanism) or the [max slice count mechanism](#the-max-slice-count-mechanism).
@@ -66,7 +126,10 @@ The _max slice count_ mechanism is an alternative slicing mechanism that uses a
### Setting the slicing mechanism
-By default, concurrent segment search uses the Lucene mechanism to calculate the number of slices for each shard-level request. To use the max slice count mechanism instead, configure the `search.concurrent.max_slice_count` cluster setting:
+By default, concurrent segment search uses the Lucene mechanism to calculate the number of slices for each shard-level request.
+To use the max slice count mechanism instead, you can set the slice count for concurrent segment search at either the cluster level or index level.
+
+To configure the slice count for all indexes in a cluster, use the following dynamic cluster setting:
```json
PUT _cluster/settings
@@ -78,7 +141,17 @@ PUT _cluster/settings
```
{% include copy-curl.html %}
-The `search.concurrent.max_slice_count` setting can take the following valid values:
+To configure the slice count for a particular index, specify the index name in the endpoint:
+
+```json
+PUT /_settings
+{
+ "index.search.concurrent.max_slice_count": 2
+}
+```
+{% include copy-curl.html %}
+
+Both the cluster- and index-level `search.concurrent.max_slice_count` settings can take the following valid values:
- `0`: Use the default Lucene mechanism.
- Positive integer: Use the max target slice count mechanism. Usually, a value between 2 and 8 should be sufficient.
@@ -117,8 +190,20 @@ Non-concurrent search calculates the document count error and returns it in the
For more information about how `shard_size` can affect both `doc_count_error_upper_bound` and collected buckets, see [this GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/11680#issuecomment-1885882985).
-## Developer information: AggregatorFactory changes
+## Developer information
+
+The following sections provide additional information for developers.
+
+### AggregatorFactory changes
+
+Because of implementation details, not all aggregator types can support concurrent segment search. To accommodate this, we have introduced a [`supportsConcurrentSegmentSearch()`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/search/aggregations/AggregatorFactory.java#L123) method in the `AggregatorFactory` class to indicate whether a given aggregation type supports concurrent segment search. By default, this method returns `false`. Any aggregator that needs to support concurrent segment search must override this method in its own factory implementation.
+
+To ensure that a custom plugin-based `Aggregator` implementation functions with the concurrent search path, plugin developers can verify their implementation with concurrent search enabled and then update the plugin to override the [`supportsConcurrentSegmentSearch()`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/search/aggregations/AggregatorFactory.java#L123) method to return `true`.
+
+### Pluggable concurrent search deciders: ConcurrentSearchRequestDecider
-Because of implementation details, not all aggregator types can support concurrent segment search. To accommodate this, we have introduced a [`supportsConcurrentSegmentSearch()`](https://github.com/opensearch-project/OpenSearch/blob/bb38ed4836496ac70258c2472668325a012ea3ed/server/src/main/java/org/opensearch/search/aggregations/AggregatorFactory.java#L121) method in the `AggregatorFactory` class to indicate whether a given aggregation type supports concurrent segment search. By default, this method returns `false`. Any aggregator that needs to support concurrent segment search must override this method in its own factory implementation.
+Introduced 2.17
+{: .label .label-purple }
-To ensure that a custom plugin-based `Aggregator` implementation works with the concurrent search path, plugin developers can verify their implementation with concurrent search enabled and then update the plugin to override the [`supportsConcurrentSegmentSearch()`](https://github.com/opensearch-project/OpenSearch/blob/bb38ed4836496ac70258c2472668325a012ea3ed/server/src/main/java/org/opensearch/search/aggregations/AggregatorFactory.java#L121) method to return `true`.
+Plugin developers can customize the concurrent search decision-making for `auto` mode by extending [`ConcurrentSearchRequestDecider`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/search/deciders/ConcurrentSearchRequestDecider.java) and registering its factory through [`SearchPlugin#getConcurrentSearchRequestFactories()`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/plugins/SearchPlugin.java#L148). The deciders are evaluated only if a request does not belong to any category listed in the [Limitations](#limitations) and [Other considerations](#other-considerations) sections. For more information about the decider implementation, see [the corresponding GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/15259).
+The search request is parsed using a `QueryBuilderVisitor`, which calls the [`ConcurrentSearchRequestDecider#evaluateForQuery()`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/search/deciders/ConcurrentSearchRequestDecider.java#L36) method of all the configured deciders for every node of the `QueryBuilder` tree in the search request. The final concurrent search decision is obtained by combining the decision from each decider returned by the [`ConcurrentSearchRequestDecider#getConcurrentSearchDecision()`](https://github.com/opensearch-project/OpenSearch/blob/2.x/server/src/main/java/org/opensearch/search/deciders/ConcurrentSearchRequestDecider.java#L44) method.
\ No newline at end of file
diff --git a/_search-plugins/knn/api.md b/_search-plugins/knn/api.md
index c7314f7ae2..d927bf1c35 100644
--- a/_search-plugins/knn/api.md
+++ b/_search-plugins/knn/api.md
@@ -185,7 +185,7 @@ This API operation only works with indexes created using the `nmslib` and `faiss
The following request evicts the native library indexes of three indexes from the cache:
```json
-GET /_plugins/_knn/clear_cache/index1,index2,index3?pretty
+POST /_plugins/_knn/clear_cache/index1,index2,index3?pretty
{
"_shards" : {
"total" : 6,
@@ -200,7 +200,7 @@ The `total` parameter indicates the number of shards that the API attempted to c
The k-NN clear cache API can be used with index patterns to clear one or more indexes that match the given pattern from the cache, as shown in the following example:
```json
-GET /_plugins/_knn/clear_cache/index*?pretty
+POST /_plugins/_knn/clear_cache/index*?pretty
{
"_shards" : {
"total" : 6,
@@ -234,7 +234,7 @@ Response field | Description
`timestamp` | The date and time when the model was created.
`description` | A user-provided description of the model.
`error` | An error message explaining why the model is in a failed state.
-`space_type` | The space type for which this model is trained, for example, Euclidean or cosine.
+`space_type` | The space type for which this model is trained, for example, Euclidean or cosine. Note: This value can also be set at the top level of the request.
`dimension` | The dimensionality of the vector space for which this model is designed.
`engine` | The native library used to create the model, either `faiss` or `nmslib`.
@@ -351,6 +351,7 @@ Request parameter | Description
`search_size` | The training data is pulled from the training index using scroll queries. This parameter defines the number of results to return per scroll query. Default is `10000`. Optional.
`description` | A user-provided description of the model. Optional.
`method` | The configuration of the approximate k-NN method used for search operations. For more information about the available methods, see [k-NN index method definitions]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index#method-definitions). The method requires training to be valid.
+`space_type` | The space type for which this model is trained, for example, Euclidean or cosine. Note: This value can also be set in the `method` parameter.
#### Usage
@@ -365,10 +366,10 @@ POST /_plugins/_knn/models/{model_id}/_train?preference={node_id}
"max_training_vector_count": 1200,
"search_size": 100,
"description": "My model",
+ "space_type": "l2",
"method": {
"name":"ivf",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"nlist":128,
"encoder":{
@@ -395,10 +396,10 @@ POST /_plugins/_knn/models/_train?preference={node_id}
"max_training_vector_count": 1200,
"search_size": 100,
"description": "My model",
+ "space_type": "l2",
"method": {
"name":"ivf",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"nlist":128,
"encoder":{
diff --git a/_search-plugins/knn/approximate-knn.md b/_search-plugins/knn/approximate-knn.md
index e9cff8562f..f8921033e0 100644
--- a/_search-plugins/knn/approximate-knn.md
+++ b/_search-plugins/knn/approximate-knn.md
@@ -49,9 +49,9 @@ PUT my-knn-index-1
"my_vector1": {
"type": "knn_vector",
"dimension": 2,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "nmslib",
"parameters": {
"ef_construction": 128,
@@ -62,9 +62,9 @@ PUT my-knn-index-1
"my_vector2": {
"type": "knn_vector",
"dimension": 4,
+ "space_type": "innerproduct",
"method": {
"name": "hnsw",
- "space_type": "innerproduct",
"engine": "faiss",
"parameters": {
"ef_construction": 256,
@@ -199,10 +199,10 @@ POST /_plugins/_knn/models/my-model/_train
"training_field": "train-field",
"dimension": 4,
"description": "My model description",
+ "space_type": "l2",
"method": {
"name": "ivf",
"engine": "faiss",
- "space_type": "l2",
"parameters": {
"nlist": 4,
"nprobes": 2
@@ -308,6 +308,72 @@ Engine | Notes
:--- | :---
`faiss` | If `nprobes` is present in a query, it overrides the value provided when creating the index.
+### Rescoring quantized results using full precision
+
+Quantization can be used to significantly reduce the memory footprint of a k-NN index. For more information about quantization, see [k-NN vector quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization). Because some vector representation is lost during quantization, the computed distances will be approximate. This causes the overall recall of the search to decrease.
+
+To improve recall while maintaining the memory savings of quantization, you can use a two-phase search approach. In the first phase, `oversample_factor * k` results are retrieved from an index using quantized vectors and the scores are approximated. In the second phase, the full-precision vectors of those `oversample_factor * k` results are loaded into memory from disk, and scores are recomputed against the full-precision query vector. The results are then reduced to the top k.
+
+The default rescoring behavior is determined by the `mode` and `compression_level` of the backing k-NN vector field:
+
+- For `in_memory` mode, no rescoring is applied by default.
+- For `on_disk` mode, default rescoring is based on the configured `compression_level`. Each `compression_level` provides a default `oversample_factor`, specified in the following table.
+
+| Compression level | Default rescore `oversample_factor` |
+|:------------------|:----------------------------------|
+| `32x` (default) | 3.0 |
+| `16x` | 2.0 |
+| `8x` | 2.0 |
+| `4x` | No default rescoring |
+| `2x` | No default rescoring |
+
+To explicitly apply rescoring, provide the `rescore` parameter in a query on a quantized index and specify the `oversample_factor`:
+
+```json
+GET my-knn-index-1/_search
+{
+ "size": 2,
+ "query": {
+ "knn": {
+ "target-field": {
+ "vector": [2, 3, 5, 6],
+ "k": 2,
+ "rescore" : {
+ "oversample_factor": 1.2
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+Alternatively, set the `rescore` parameter to `true` to use a default `oversample_factor` of `1.0`:
+
+```json
+GET my-knn-index-1/_search
+{
+ "size": 2,
+ "query": {
+ "knn": {
+ "target-field": {
+ "vector": [2, 3, 5, 6],
+ "k": 2,
+ "rescore" : true
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The `oversample_factor` is a floating-point number between 1.0 and 100.0, inclusive. The number of results in the first pass is calculated as `oversample_factor * k` and is guaranteed to be between 100 and 10,000, inclusive. If the calculated number of results is smaller than 100, then the number of results is set to 100. If the calculated number of results is greater than 10,000, then the number of results is set to 10,000.
+
+Rescoring is only supported for the `faiss` engine.
+
+Rescoring is not needed if quantization is not used because the scores returned are already fully precise.
+{: .note}
+
### Using approximate k-NN with filters
To learn about using filters with k-NN search, see [k-NN search with filters]({{site.url}}{{site.baseurl}}/search-plugins/knn/filter-search-knn/).
@@ -322,7 +388,7 @@ To learn more about the radial search feature, see [k-NN radial search]({{site.u
### Using approximate k-NN with binary vectors
-To learn more about using binary vectors with k-NN search, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors).
+To learn more about using binary vectors with k-NN search, see [Binary vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors).
## Spaces
@@ -346,5 +412,5 @@ The cosine similarity formula does not include the `1 -` prefix. However, becaus
With cosine similarity, it is not valid to pass a zero vector (`[0, 0, ...]`) as input. This is because the magnitude of such a vector is 0, which raises a `divide by 0` exception in the corresponding formula. Requests containing the zero vector will be rejected, and a corresponding exception will be thrown.
{: .note }
-The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors).
+The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors).
{: .note}
diff --git a/_search-plugins/knn/disk-based-vector-search.md b/_search-plugins/knn/disk-based-vector-search.md
new file mode 100644
index 0000000000..dfb9262db5
--- /dev/null
+++ b/_search-plugins/knn/disk-based-vector-search.md
@@ -0,0 +1,193 @@
+---
+layout: default
+title: Disk-based vector search
+nav_order: 16
+parent: k-NN search
+has_children: false
+---
+
+# Disk-based vector search
+**Introduced 2.17**
+{: .label .label-purple}
+
+For low-memory environments, OpenSearch provides _disk-based vector search_, which significantly reduces the operational costs for vector workloads. Disk-based vector search uses [binary quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#binary-quantization), compressing vectors and thereby reducing the memory requirements. This memory optimization provides large memory savings at the cost of slightly increased search latency while still maintaining strong recall.
+
+To use disk-based vector search, set the [`mode`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#vector-workload-modes) parameter to `on_disk` for your vector field type. This parameter will configure your index to use secondary storage.
+
+## Creating an index for disk-based vector search
+
+To create an index for disk-based vector search, send the following request:
+
+```json
+PUT my-vector-index
+{
+ "mappings": {
+ "properties": {
+ "my_vector_field": {
+ "type": "knn_vector",
+ "dimension": 8,
+ "space_type": "innerproduct",
+ "data_type": "float",
+ "mode": "on_disk"
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+By default, the `on_disk` mode configures the index to use the `faiss` engine and `hnsw` method. The default [`compression_level`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#compression-levels) of `32x` reduces the amount of memory the vectors require by a factor of 32. To preserve the search recall, rescoring is enabled by default. A search on a disk-optimized index runs in two phases: The compressed index is searched first, and then the results are rescored using full-precision vectors loaded from disk.
+
+To reduce the compression level, provide the `compression_level` parameter when creating the index mapping:
+
+```json
+PUT my-vector-index
+{
+ "mappings": {
+ "properties": {
+ "my_vector_field": {
+ "type": "knn_vector",
+ "dimension": 8,
+ "space_type": "innerproduct",
+ "data_type": "float",
+ "mode": "on_disk",
+ "compression_level": "16x"
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+For more information about the `compression_level` parameter, see [Compression levels]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#compression-levels). Note that for `4x` compression, the `lucene` engine will be used.
+{: .note}
+
+If you need more granular fine-tuning, you can override additional k-NN parameters in the method definition. For example, to improve recall, increase the `ef_construction` parameter value:
+
+```json
+PUT my-vector-index
+{
+ "mappings": {
+ "properties": {
+ "my_vector_field": {
+ "type": "knn_vector",
+ "dimension": 8,
+ "space_type": "innerproduct",
+ "data_type": "float",
+ "mode": "on_disk",
+ "method": {
+ "params": {
+ "ef_construction": 512
+ }
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The `on_disk` mode only works with the `float` data type.
+{: .note}
+
+## Ingestion
+
+You can perform document ingestion for a disk-optimized vector index in the same way as for a regular vector index. To index several documents in bulk, send the following request:
+
+```json
+POST _bulk
+{ "index": { "_index": "my-vector-index", "_id": "1" } }
+{ "my_vector_field": [1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5], "price": 12.2 }
+{ "index": { "_index": "my-vector-index", "_id": "2" } }
+{ "my_vector_field": [2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5], "price": 7.1 }
+{ "index": { "_index": "my-vector-index", "_id": "3" } }
+{ "my_vector_field": [3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5], "price": 12.9 }
+{ "index": { "_index": "my-vector-index", "_id": "4" } }
+{ "my_vector_field": [4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5], "price": 1.2 }
+{ "index": { "_index": "my-vector-index", "_id": "5" } }
+{ "my_vector_field": [5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5], "price": 3.7 }
+{ "index": { "_index": "my-vector-index", "_id": "6" } }
+{ "my_vector_field": [6.5, 6.5, 6.5, 6.5, 6.5, 6.5, 6.5, 6.5], "price": 10.3 }
+{ "index": { "_index": "my-vector-index", "_id": "7" } }
+{ "my_vector_field": [7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5], "price": 5.5 }
+{ "index": { "_index": "my-vector-index", "_id": "8" } }
+{ "my_vector_field": [8.5, 8.5, 8.5, 8.5, 8.5, 8.5, 8.5, 8.5], "price": 4.4 }
+{ "index": { "_index": "my-vector-index", "_id": "9" } }
+{ "my_vector_field": [9.5, 9.5, 9.5, 9.5, 9.5, 9.5, 9.5, 9.5], "price": 8.9 }
+```
+{% include copy-curl.html %}
+
+## Search
+
+Search is also performed in the same way as in other index configurations. The key difference is that, by default, the `oversample_factor` of the rescore parameter is set to `3.0` (unless you override the `compression_level`). For more information, see [Rescoring quantized results using full precision]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#rescoring-quantized-results-using-full-precision). To perform vector search on a disk-optimized index, provide the search vector:
+
+```json
+GET my-vector-index/_search
+{
+ "query": {
+ "knn": {
+ "my_vector_field": {
+ "vector": [1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5],
+ "k": 5
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+Similarly to other index configurations, you can override k-NN parameters in the search request:
+
+```json
+GET my-vector-index/_search
+{
+ "query": {
+ "knn": {
+ "my_vector_field": {
+ "vector": [1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5],
+ "k": 5,
+ "method_parameters": {
+ "ef_search": 512
+ },
+ "rescore": {
+ "oversample_factor": 10.0
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+[Radial search]({{site.url}}{{site.baseurl}}/search-plugins/knn/radial-search-knn/) does not support disk-based vector search.
+{: .note}
+
+## Model-based indexes
+
+For [model-based indexes]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-k-nn-index-from-a-model), you can specify the `on_disk` parameter in the training request in the same way that you would specify it during index creation. By default, `on_disk` mode will use the [Faiss IVF method]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#supported-faiss-methods) and a compression level of `32x`. To run the training API, send the following request:
+
+```json
+POST /_plugins/_knn/models/test-model/_train
+{
+ "training_index": "train-index-name",
+ "training_field": "train-field-name",
+ "dimension": 8,
+ "max_training_vector_count": 1200,
+ "search_size": 100,
+ "description": "My model",
+ "space_type": "innerproduct",
+ "mode": "on_disk"
+}
+```
+{% include copy-curl.html %}
+
+This command assumes that training data has been ingested into the `train-index-name` index. For more information, see [Building a k-NN index from a model]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-k-nn-index-from-a-model).
+{: .note}
+
+You can override the `compression_level` for disk-optimized indexes in the same way as for regular k-NN indexes.
+
+
+## Next steps
+
+- For more information about binary quantization, see [Binary quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#binary-quantization).
+- For more information about k-NN vector workload modes, see [Vector workload modes]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#vector-workload-modes).
\ No newline at end of file
diff --git a/_search-plugins/knn/knn-index.md b/_search-plugins/knn/knn-index.md
index a6ffd922eb..620b262cf9 100644
--- a/_search-plugins/knn/knn-index.md
+++ b/_search-plugins/knn/knn-index.md
@@ -25,9 +25,9 @@ PUT /test-index
"my_vector1": {
"type": "knn_vector",
"dimension": 3,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "lucene",
"parameters": {
"ef_construction": 128,
@@ -41,17 +41,17 @@ PUT /test-index
```
{% include copy-curl.html %}
-## Lucene byte vector
+## Byte vectors
-Starting with k-NN plugin version 2.9, you can use `byte` vectors with the `lucene` engine to reduce the amount of storage space needed. For more information, see [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#lucene-byte-vector).
+Starting with k-NN plugin version 2.17, you can use `byte` vectors with the `faiss` and `lucene` engines to reduce the amount of required memory and storage space. For more information, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#byte-vectors).
-## Binary vector
+## Binary vectors
-Starting with k-NN plugin version 2.16, you can use `binary` vectors with the `faiss` engine to reduce the amount of required storage space. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors).
+Starting with k-NN plugin version 2.16, you can use `binary` vectors with the `faiss` engine to reduce the amount of required storage space. For more information, see [Binary vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors).
## SIMD optimization for the Faiss engine
-Starting with version 2.13, the k-NN plugin supports [Single Instruction Multiple Data (SIMD)](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data) processing if the underlying hardware supports SIMD instructions (AVX2 on x64 architecture and Neon on ARM64 architecture). SIMD is supported by default on Linux machines only for the Faiss engine. SIMD architecture helps boost overall performance by improving indexing throughput and reducing search latency.
+Starting with version 2.13, the k-NN plugin supports [Single Instruction Multiple Data (SIMD)](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data) processing if the underlying hardware supports SIMD instructions (AVX2 on x64 architecture and Neon on ARM64 architecture). SIMD is supported by default on Linux machines only for the Faiss engine. SIMD architecture helps boost overall performance by improving indexing throughput and reducing search latency. Starting with version 2.18, the k-NN plugin supports AVX512 SIMD instructions on x64 architecture.
SIMD optimization is applicable only if the vector dimension is a multiple of 8.
{: .note}
@@ -60,14 +60,22 @@ SIMD optimization is applicable only if the vector dimension is a multiple of 8.
### x64 architecture
-For the x64 architecture, two different versions of the Faiss library are built and shipped with the artifact:
+For x64 architecture, the following versions of the Faiss library are built and shipped with the artifact:
- `libopensearchknn_faiss.so`: The non-optimized Faiss library without SIMD instructions.
-- `libopensearchknn_faiss_avx2.so`: The Faiss library that contains AVX2 SIMD instructions.
+- `libopensearchknn_faiss_avx512.so`: The Faiss library containing AVX512 SIMD instructions.
+- `libopensearchknn_faiss_avx2.so`: The Faiss library containing AVX2 SIMD instructions.
-If your hardware supports AVX2, the k-NN plugin loads the `libopensearchknn_faiss_avx2.so` library at runtime.
+When using the Faiss library, the performance ranking is as follows: AVX512 > AVX2 > no optimization.
+{: .note }
+
+If your hardware supports AVX512, the k-NN plugin loads the `libopensearchknn_faiss_avx512.so` library at runtime.
+
+If your hardware supports AVX2 but doesn't support AVX512, the k-NN plugin loads the `libopensearchknn_faiss_avx2.so` library at runtime.
+
+To disable the AVX512 and AVX2 SIMD instructions and load the non-optimized Faiss library (`libopensearchknn_faiss.so`), specify the `knn.faiss.avx512.disabled` and `knn.faiss.avx2.disabled` static settings as `true` in `opensearch.yml` (by default, both of these are `false`).
-To disable AVX2 and load the non-optimized Faiss library (`libopensearchknn_faiss.so`), specify the `knn.faiss.avx2.disabled` static setting as `true` in `opensearch.yml` (default is `false`). Note that to update a static setting, you must stop the cluster, change the setting, and restart the cluster. For more information, see [Static settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/#static-settings).
+Note that to update a static setting, you must stop the cluster, change the setting, and restart the cluster. For more information, see [Static settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/#static-settings).
### ARM64 architecture
@@ -83,7 +91,7 @@ A method definition will always contain the name of the method, the space_type t
Mapping parameter | Required | Default | Updatable | Description
:--- | :--- | :--- | :--- | :---
`name` | true | n/a | false | The identifier for the nearest neighbor method.
-`space_type` | false | l2 | false | The vector space used to calculate the distance between vectors.
+`space_type` | false | l2 | false | The vector space used to calculate the distance between vectors. Note: This value can also be specified at the top level of the mapping.
`engine` | false | nmslib | false | The approximate k-NN library to use for indexing and search. The available libraries are faiss, nmslib, and Lucene.
`parameters` | false | null | false | The parameters used for the nearest neighbor method.
@@ -116,7 +124,7 @@ Method name | Requires training | Supported spaces | Description
For hnsw, "innerproduct" is not available when PQ is used.
{: .note}
-The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors).
+The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors).
{: .note}
#### HNSW parameters
@@ -168,7 +176,6 @@ An index created in OpenSearch version 2.11 or earlier will still use the old `e
"method": {
"name":"hnsw",
"engine":"lucene",
- "space_type": "l2",
"parameters":{
"m":2048,
"ef_construction": 245
@@ -186,7 +193,6 @@ The following example method definition specifies the `hnsw` method and a `pq` e
"method": {
"name":"hnsw",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"encoder":{
"name":"pq",
@@ -232,7 +238,6 @@ The following example uses the `ivf` method without specifying an encoder (by d
"method": {
"name":"ivf",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"nlist": 4,
"nprobes": 2
@@ -246,7 +251,6 @@ The following example uses the `ivf` method with a `pq` encoder:
"method": {
"name":"ivf",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"encoder":{
"name":"pq",
@@ -265,7 +269,6 @@ The following example uses the `hnsw` method without specifying an encoder (by d
"method": {
"name":"hnsw",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"ef_construction": 256,
"m": 8
@@ -279,7 +282,6 @@ The following example uses the `hnsw` method with an `sq` encoder of type `fp16`
"method": {
"name":"hnsw",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"encoder": {
"name": "sq",
@@ -300,7 +302,6 @@ The following example uses the `ivf` method with an `sq` encoder of type `fp16`:
"method": {
"name":"ivf",
"engine":"faiss",
- "space_type": "l2",
"parameters":{
"encoder": {
"name": "sq",
@@ -324,7 +325,7 @@ If you want to use less memory and increase indexing speed as compared to HNSW w
If memory is a concern, consider adding a PQ encoder to your HNSW or IVF index. Because PQ is a lossy encoding, query quality will drop.
-You can reduce the memory footprint by a factor of 2, with a minimal loss in search quality, by using the [`fp_16` encoder]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#faiss-16-bit-scalar-quantization). If your vector dimensions are within the [-128, 127] byte range, we recommend using the [byte quantizer]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#lucene-byte-vector) to reduce the memory footprint by a factor of 4. To learn more about vector quantization options, see [k-NN vector quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/).
+You can reduce the memory footprint by a factor of 2, with a minimal loss in search quality, by using the [`fp_16` encoder]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/#faiss-16-bit-scalar-quantization). If your vector dimensions are within the [-128, 127] byte range, we recommend using the [byte quantizer]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#byte-vectors) to reduce the memory footprint by a factor of 4. To learn more about vector quantization options, see [k-NN vector quantization]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-vector-quantization/).
### Memory estimation
diff --git a/_search-plugins/knn/knn-score-script.md b/_search-plugins/knn/knn-score-script.md
index d2fd883e74..a184de2d3d 100644
--- a/_search-plugins/knn/knn-score-script.md
+++ b/_search-plugins/knn/knn-score-script.md
@@ -302,5 +302,5 @@ Cosine similarity returns a number between -1 and 1, and because OpenSearch rele
With cosine similarity, it is not valid to pass a zero vector (`[0, 0, ... ]`) as input. This is because the magnitude of such a vector is 0, which raises a `divide by 0` exception in the corresponding formula. Requests containing the zero vector will be rejected, and a corresponding exception will be thrown.
{: .note }
-The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors).
+The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors).
{: .note}
diff --git a/_search-plugins/knn/knn-vector-quantization.md b/_search-plugins/knn/knn-vector-quantization.md
index 656ce72fd2..a911dc91c9 100644
--- a/_search-plugins/knn/knn-vector-quantization.md
+++ b/_search-plugins/knn/knn-vector-quantization.md
@@ -11,15 +11,15 @@ has_math: true
By default, the k-NN plugin supports the indexing and querying of vectors of type `float`, where each dimension of the vector occupies 4 bytes of memory. For use cases that require ingestion on a large scale, keeping `float` vectors can be expensive because OpenSearch needs to construct, load, save, and search graphs (for native `nmslib` and `faiss` engines). To reduce the memory footprint, you can use vector quantization.
-OpenSearch supports many varieties of quantization. In general, the level of quantization will provide a trade-off between the accuracy of the nearest neighbor search and the size of the memory footprint consumed by the vector search. The supported types include byte vectors, 16-bit scalar quantization, and product quantization (PQ).
+OpenSearch supports many varieties of quantization. In general, the level of quantization will provide a trade-off between the accuracy of the nearest neighbor search and the size of the memory footprint consumed by the vector search. The supported types include byte vectors, 16-bit scalar quantization, product quantization (PQ), and binary quantization (BQ).
-## Lucene byte vector
+## Byte vectors
-Starting with k-NN plugin version 2.9, you can use `byte` vectors with the Lucene engine in order to reduce the amount of required memory. This requires quantizing the vectors outside of OpenSearch before ingesting them into an OpenSearch index. For more information, see [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#lucene-byte-vector).
+Starting with version 2.17, the k-NN plugin supports `byte` vectors with the `faiss` and `lucene` engines in order to reduce the amount of required memory. This requires quantizing the vectors outside of OpenSearch before ingesting them into an OpenSearch index. For more information, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#byte-vectors).
## Lucene scalar quantization
-Starting with version 2.16, the k-NN plugin supports built-in scalar quantization for the Lucene engine. Unlike the [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#lucene-byte-vector), which requires you to quantize vectors before ingesting the documents, the Lucene scalar quantizer quantizes input vectors in OpenSearch during ingestion. The Lucene scalar quantizer converts 32-bit floating-point input vectors into 7-bit integer vectors in each segment using the minimum and maximum quantiles computed based on the [`confidence_interval`](#confidence-interval) parameter. During search, the query vector is quantized in each segment using the segment's minimum and maximum quantiles in order to compute the distance between the query vector and the segment's quantized input vectors.
+Starting with version 2.16, the k-NN plugin supports built-in scalar quantization for the Lucene engine. Unlike [byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#byte-vectors), which require you to quantize vectors before ingesting documents, the Lucene scalar quantizer quantizes input vectors in OpenSearch during ingestion. The Lucene scalar quantizer converts 32-bit floating-point input vectors into 7-bit integer vectors in each segment using the minimum and maximum quantiles computed based on the [`confidence_interval`](#confidence-interval) parameter. During search, the query vector is quantized in each segment using the segment's minimum and maximum quantiles in order to compute the distance between the query vector and the segment's quantized input vectors.
Quantization can decrease the memory footprint by a factor of 4 in exchange for some loss in recall. Additionally, quantization slightly increases disk usage because it requires storing both the raw input vectors and the quantized vectors.
@@ -40,10 +40,10 @@ PUT /test-index
"my_vector1": {
"type": "knn_vector",
"dimension": 2,
+ "space_type": "l2",
"method": {
"name": "hnsw",
"engine": "lucene",
- "space_type": "l2",
"parameters": {
"encoder": {
"name": "sq"
@@ -85,10 +85,10 @@ PUT /test-index
"my_vector1": {
"type": "knn_vector",
"dimension": 2,
+ "space_type": "l2",
"method": {
"name": "hnsw",
"engine": "lucene",
- "space_type": "l2",
"parameters": {
"encoder": {
"name": "sq",
@@ -115,7 +115,7 @@ In the ideal scenario, 7-bit vectors created by the Lucene scalar quantizer use
#### HNSW memory estimation
-The memory required for the Hierarchical Navigable Small World (HNSW) graph can be estimated as `1.1 * (dimension + 8 * M)` bytes/vector, where `M` is the maximum number of bidirectional links created for each element during the construction of the graph.
+The memory required for the Hierarchical Navigable Small World (HNSW) graph can be estimated as `1.1 * (dimension + 8 * m)` bytes/vector, where `m` is the maximum number of bidirectional links created for each element during the construction of the graph.
As an example, assume that you have 1 million vectors with a dimension of 256 and M of 16. The memory requirement can be estimated as follows:
@@ -150,10 +150,10 @@ PUT /test-index
"my_vector1": {
"type": "knn_vector",
"dimension": 3,
+ "space_type": "l2",
"method": {
"name": "hnsw",
"engine": "faiss",
- "space_type": "l2",
"parameters": {
"encoder": {
"name": "sq"
@@ -194,10 +194,10 @@ PUT /test-index
"my_vector1": {
"type": "knn_vector",
"dimension": 3,
+ "space_type": "l2",
"method": {
"name": "hnsw",
"engine": "faiss",
- "space_type": "l2",
"parameters": {
"encoder": {
"name": "sq",
@@ -250,9 +250,9 @@ In the best-case scenario, 16-bit vectors produced by the Faiss SQfp16 quantizer
#### HNSW memory estimation
-The memory required for Hierarchical Navigable Small Worlds (HNSW) is estimated to be `1.1 * (2 * dimension + 8 * M)` bytes/vector.
+The memory required for the Hierarchical Navigable Small World (HNSW) graph is estimated to be `1.1 * (2 * dimension + 8 * m)` bytes/vector, where `m` is the maximum number of bidirectional links created for each element during the construction of the graph.
-As an example, assume that you have 1 million vectors with a dimension of 256 and M of 16. The memory requirement can be estimated as follows:
+As an example, assume that you have 1 million vectors with a dimension of 256 and an `m` of 16. The memory requirement can be estimated as follows:
```r
1.1 * (2 * 256 + 8 * 16) * 1,000,000 ~= 0.656 GB
@@ -260,9 +260,9 @@ As an example, assume that you have 1 million vectors with a dimension of 256 an
#### IVF memory estimation
-The memory required for IVF is estimated to be `1.1 * (((2 * dimension) * num_vectors) + (4 * nlist * d))` bytes/vector.
+The memory required for IVF is estimated to be `1.1 * (((2 * dimension) * num_vectors) + (4 * nlist * dimension))` bytes, where `nlist` is the number of buckets to partition vectors into.
-As an example, assume that you have 1 million vectors with a dimension of 256 and `nlist` of 128. The memory requirement can be estimated as follows:
+As an example, assume that you have 1 million vectors with a dimension of 256 and an `nlist` of 128. The memory requirement can be estimated as follows:
```r
1.1 * (((2 * 256) * 1,000,000) + (4 * 128 * 256)) ~= 0.525 GB
@@ -310,3 +310,175 @@ For example, assume that you have 1 million vectors with a dimension of 256, `iv
```r
1.1*((8 / 8 * 64 + 24) * 1000000 + 100 * (2^8 * 4 * 256 + 4 * 512 * 256)) ~= 0.171 GB
```
+
+## Binary quantization
+
+Starting with version 2.17, OpenSearch supports BQ for binary vectors with the Faiss engine. BQ compresses vectors into a binary format (0s and 1s), making it highly efficient in terms of memory usage. You can choose to represent each vector dimension using 1, 2, or 4 bits, depending on the desired precision. One of the advantages of using BQ is that the training process is handled automatically during indexing. This means that no separate training step is required, unlike other quantization techniques such as PQ.
+
+### Using BQ
+
+To configure BQ for the Faiss engine, define a `knn_vector` field and specify the `mode` as `on_disk`. This configuration defaults to 1-bit BQ, with both `ef_search` and `ef_construction` set to `100`:
+
+```json
+PUT my-vector-index
+{
+ "mappings": {
+ "properties": {
+ "my_vector_field": {
+ "type": "knn_vector",
+ "dimension": 8,
+ "space_type": "l2",
+ "data_type": "float",
+ "mode": "on_disk"
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+To further optimize the configuration, you can specify additional parameters, such as the compression level, and fine-tune the search parameters. For example, you can override the `ef_construction` value or define the compression level, which corresponds to the number of bits used for quantization:
+
+- **32x compression** for 1-bit quantization
+- **16x compression** for 2-bit quantization
+- **8x compression** for 4-bit quantization
+
+This allows for greater control over memory usage and recall performance, providing flexibility to balance between precision and storage efficiency.
+
+To specify the compression level, set the `compression_level` parameter:
+
+```json
+PUT my-vector-index
+{
+ "mappings": {
+ "properties": {
+ "my_vector_field": {
+ "type": "knn_vector",
+ "dimension": 8,
+ "space_type": "l2",
+ "data_type": "float",
+ "mode": "on_disk",
+ "compression_level": "16x",
+ "method": {
+ "params": {
+ "ef_construction": 16
+ }
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+The following example further fine-tunes the configuration by defining `ef_construction`, `encoder`, and the number of `bits` (which can be `1`, `2`, or `4`):
+
+```json
+PUT my-vector-index
+{
+ "mappings": {
+ "properties": {
+ "my_vector_field": {
+ "type": "knn_vector",
+ "dimension": 8,
+ "method": {
+ "name": "hnsw",
+ "engine": "faiss",
+ "space_type": "l2",
+ "params": {
+ "m": 16,
+ "ef_construction": 512,
+ "encoder": {
+ "name": "binary",
+ "parameters": {
+ "bits": 1
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+### Search using binary quantized vectors
+
+You can perform a k-NN search on your index by providing a vector and specifying the number of nearest neighbors (k) to return:
+
+```json
+GET my-vector-index/_search
+{
+ "size": 2,
+ "query": {
+ "knn": {
+ "my_vector_field": {
+ "vector": [1.5, 5.5, 1.5, 5.5, 1.5, 5.5, 1.5, 5.5],
+ "k": 10
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+You can also fine-tune search by providing the `ef_search` and `oversample_factor` parameters.
+The `oversample_factor` parameter controls the factor by which the search oversamples the candidate vectors before ranking them. Using a higher oversample factor means that more candidates will be considered before ranking, improving accuracy but also increasing search time. When selecting the `oversample_factor` value, consider the trade-off between accuracy and efficiency. For example, setting the `oversample_factor` to `2.0` will double the number of candidates considered during the ranking phase, which may help achieve better results.
+
+The following request specifies the `ef_search` and `oversample_factor` parameters:
+
+```json
+GET my-vector-index/_search
+{
+ "size": 2,
+ "query": {
+ "knn": {
+ "my_vector_field": {
+ "vector": [1.5, 5.5, 1.5, 5.5, 1.5, 5.5, 1.5, 5.5],
+ "k": 10,
+ "method_parameters": {
+ "ef_search": 10
+ },
+ "rescore": {
+ "oversample_factor": 10.0
+ }
+ }
+ }
+ }
+}
+```
+{% include copy-curl.html %}
+
+
+#### HNSW memory estimation
+
+The memory required for the Hierarchical Navigable Small World (HNSW) graph can be estimated as `1.1 * (dimension + 8 * m)` bytes/vector, where `m` is the maximum number of bidirectional links created for each element during the construction of the graph.
+
+As an example, assume that you have 1 million vectors with a dimension of 256 and an `m` of 16. The following sections provide memory requirement estimations for various compression values.
+
+##### 1-bit quantization (32x compression)
+
+In 1-bit quantization, each dimension is represented using 1 bit, equivalent to a 32x compression factor. The memory requirement can be estimated as follows:
+
+```r
+Memory = 1.1 * ((256 * 1 / 8) + 8 * 16) * 1,000,000
+ ~= 0.176 GB
+```
+
+##### 2-bit quantization (16x compression)
+
+In 2-bit quantization, each dimension is represented using 2 bits, equivalent to a 16x compression factor. The memory requirement can be estimated as follows:
+
+```r
+Memory = 1.1 * ((256 * 2 / 8) + 8 * 16) * 1,000,000
+ ~= 0.211 GB
+```
+
+##### 4-bit quantization (8x compression)
+
+In 4-bit quantization, each dimension is represented using 4 bits, equivalent to an 8x compression factor. The memory requirement can be estimated as follows:
+
+```r
+Memory = 1.1 * ((256 * 4 / 8) + 8 * 16) * 1,000,000
+ ~= 0.282 GB
+```
diff --git a/_search-plugins/knn/nested-search-knn.md b/_search-plugins/knn/nested-search-knn.md
index d947ebc6e6..bbba6c9c1e 100644
--- a/_search-plugins/knn/nested-search-knn.md
+++ b/_search-plugins/knn/nested-search-knn.md
@@ -38,9 +38,9 @@ PUT my-knn-index-1
"my_vector": {
"type": "knn_vector",
"dimension": 3,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "lucene",
"parameters": {
"ef_construction": 100,
@@ -324,9 +324,9 @@ PUT my-knn-index-1
"my_vector": {
"type": "knn_vector",
"dimension": 3,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "lucene",
"parameters": {
"ef_construction": 100,
diff --git a/_search-plugins/knn/painless-functions.md b/_search-plugins/knn/painless-functions.md
index cc27776fc4..7a8d9fec7b 100644
--- a/_search-plugins/knn/painless-functions.md
+++ b/_search-plugins/knn/painless-functions.md
@@ -55,7 +55,7 @@ l1Norm | `float l1Norm (float[] queryVector, doc['vector field'])` | This functi
cosineSimilarity | `float cosineSimilarity (float[] queryVector, doc['vector field'])` | Cosine similarity is an inner product of the query vector and document vector normalized to both have a length of 1. If the magnitude of the query vector doesn't change throughout the query, you can pass the magnitude of the query vector to improve performance, instead of calculating the magnitude every time for every filtered document:
`float cosineSimilarity (float[] queryVector, doc['vector field'], float normQueryVector)`
In general, the range of cosine similarity is [-1, 1]. However, in the case of information retrieval, the cosine similarity of two documents ranges from 0 to 1 because the tf-idf statistic can't be negative. Therefore, the k-NN plugin adds 1.0 in order to always yield a positive cosine similarity score.
hamming | `float hamming (float[] queryVector, doc['vector field'])` | This function calculates the Hamming distance between a given query vector and document vectors. The Hamming distance is the number of positions at which the corresponding elements are different. The shorter the distance, the more relevant the document is, so this example inverts the return value of the Hamming distance.
-The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors).
+The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors).
{: .note}
## Constraints
diff --git a/_search-plugins/knn/performance-tuning.md b/_search-plugins/knn/performance-tuning.md
index 123b1daef1..77f44dee93 100644
--- a/_search-plugins/knn/performance-tuning.md
+++ b/_search-plugins/knn/performance-tuning.md
@@ -59,9 +59,9 @@ The `_source` field contains the original JSON document body that was passed at
"location": {
"type": "knn_vector",
"dimension": 2,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "faiss"
}
}
@@ -85,9 +85,9 @@ In OpenSearch 2.15 or later, you can further improve indexing speed and reduce d
"location": {
"type": "knn_vector",
"dimension": 2,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "faiss"
}
}
diff --git a/_search-plugins/knn/radial-search-knn.md b/_search-plugins/knn/radial-search-knn.md
index 1a4a223294..e5449a0993 100644
--- a/_search-plugins/knn/radial-search-knn.md
+++ b/_search-plugins/knn/radial-search-knn.md
@@ -53,9 +53,9 @@ PUT knn-index-test
"my_vector": {
"type": "knn_vector",
"dimension": 2,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "faiss",
"parameters": {
"ef_construction": 100,
diff --git a/_search-plugins/knn/settings.md b/_search-plugins/knn/settings.md
index 1b9aa3608c..e4731ec94c 100644
--- a/_search-plugins/knn/settings.md
+++ b/_search-plugins/knn/settings.md
@@ -27,6 +27,7 @@ Setting | Static/Dynamic | Default | Description
`knn.model.index.number_of_replicas`| Dynamic | `1` | The number of replica shards to use for the model system index. Generally, in a multi-node cluster, this value should be at least 1 in order to increase stability.
`knn.model.cache.size.limit` | Dynamic | `10%` | The model cache limit cannot exceed 25% of the JVM heap.
`knn.faiss.avx2.disabled` | Static | `false` | A static setting that specifies whether to disable the SIMD-based `libopensearchknn_faiss_avx2.so` library and load the non-optimized `libopensearchknn_faiss.so` library for the Faiss engine on machines with x64 architecture. For more information, see [SIMD optimization for the Faiss engine]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#simd-optimization-for-the-faiss-engine).
+`knn.faiss.avx512.disabled` | Static | `false` | A static setting that specifies whether to disable the SIMD-based `libopensearchknn_faiss_avx512.so` library and load the `libopensearchknn_faiss_avx2.so` library or the non-optimized `libopensearchknn_faiss.so` library for the Faiss engine on machines with x64 architecture. For more information, see [SIMD optimization for the Faiss engine]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#simd-optimization-for-the-faiss-engine).
## Index settings
diff --git a/_search-plugins/search-pipelines/using-search-pipeline.md b/_search-plugins/search-pipelines/using-search-pipeline.md
index ecb988ad11..b6dbbdc5d0 100644
--- a/_search-plugins/search-pipelines/using-search-pipeline.md
+++ b/_search-plugins/search-pipelines/using-search-pipeline.md
@@ -17,14 +17,45 @@ You can use a search pipeline in the following ways:
## Specifying an existing search pipeline for a request
-After you [create a search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/), you can use the pipeline with a query by specifying the pipeline name in the `search_pipeline` query parameter:
+After you [create a search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/), you can use the pipeline with a query in the following ways. For a complete example of using a search pipeline with a `filter_query` processor, see [`filter_query` processor example]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/filter-query-processor#example).
+
+### Specifying the pipeline in a query parameter
+
+You can specify the pipeline name in the `search_pipeline` query parameter as follows:
```json
GET /my_index/_search?search_pipeline=my_pipeline
```
{% include copy-curl.html %}
-For a complete example of using a search pipeline with a `filter_query` processor, see [`filter_query` processor example]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/filter-query-processor#example).
+### Specifying the pipeline in the request body
+
+You can provide a search pipeline ID in the search request body as follows:
+
+```json
+GET /my-index/_search
+{
+ "query": {
+ "match_all": {}
+ },
+ "from": 0,
+ "size": 10,
+ "search_pipeline": "my_pipeline"
+}
+```
+{% include copy-curl.html %}
+
+For multi-search, you can provide a search pipeline ID in the search request body as follows:
+
+```json
+GET /_msearch
+{ "index": "test"}
+{ "query": { "match_all": {} }, "from": 0, "size": 10, "search_pipeline": "my_pipeline"}
+{ "index": "test-1", "search_type": "dfs_query_then_fetch"}
+{ "query": { "match_all": {} }, "search_pipeline": "my_pipeline1" }
+
+```
+{% include copy-curl.html %}
## Using a temporary search pipeline for a request
diff --git a/_search-plugins/searching-data/inner-hits.md b/_search-plugins/searching-data/inner-hits.md
index 395e9e748a..38fc7a491d 100644
--- a/_search-plugins/searching-data/inner-hits.md
+++ b/_search-plugins/searching-data/inner-hits.md
@@ -139,8 +139,8 @@ The preceding query searches for nested user objects containing the name John an
}
}
```
-## Inner hits with parent-child objects
-Parent-join relationships allow you to create relationships between documents of different types within the same index. The following example request searches with `inner_hits` using parent-child objects.
+## Inner hits with parent/child objects
+Parent-join relationships allow you to create relationships between documents of different types within the same index. The following example request searches with `inner_hits` using parent/child objects.
1. Create an index with a parent-join field:
@@ -806,4 +806,8 @@ The following is the expected result:
Using `inner_hits` provides contextual relevance by showing exactly which nested or child documents match the query criteria. This is crucial for applications in which the relevance of results depends on a specific part of the document that matches the query.
- Example use case: In a customer support system, you have tickets as parent documents and comments or updates as nested or child documents. You can determine which specific comment matches the search in order to better understand the context of the ticket search.
\ No newline at end of file
+ Example use case: In a customer support system, you have tickets as parent documents and comments or updates as nested or child documents. You can determine which specific comment matches the search in order to better understand the context of the ticket search.
+
+## Next steps
+
+- Learn about [joining queries]({{site.url}}{{site.baseurl}}/query-dsl/joining/) on [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) or [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) fields.
\ No newline at end of file
diff --git a/_search-plugins/vector-search.md b/_search-plugins/vector-search.md
index cd893f4144..f19030bf90 100644
--- a/_search-plugins/vector-search.md
+++ b/_search-plugins/vector-search.md
@@ -37,9 +37,9 @@ PUT test-index
"my_vector1": {
"type": "knn_vector",
"dimension": 1024,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "nmslib",
"parameters": {
"ef_construction": 128,
@@ -57,7 +57,7 @@ PUT test-index
You must designate the field that will store vectors as a [`knn_vector`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/) field type. OpenSearch supports vectors of up to 16,000 dimensions, each of which is represented as a 32-bit or 16-bit float.
-To save storage space, you can use `byte` or `binary` vectors. For more information, see [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#lucene-byte-vector) and [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors).
+To save storage space, you can use `byte` or `binary` vectors. For more information, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#byte-vectors) and [Binary vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors).
### k-NN vector search
@@ -131,9 +131,9 @@ PUT /hotels-index
"location": {
"type": "knn_vector",
"dimension": 2,
+ "space_type": "l2",
"method": {
"name": "hnsw",
- "space_type": "l2",
"engine": "lucene",
"parameters": {
"ef_construction": 100,
diff --git a/_security-analytics/threat-intelligence/getting-started.md b/_security-analytics/threat-intelligence/getting-started.md
index 366bc2674c..b26063bed0 100644
--- a/_security-analytics/threat-intelligence/getting-started.md
+++ b/_security-analytics/threat-intelligence/getting-started.md
@@ -50,15 +50,64 @@ Local files uploaded as the threat intelligence source must use the following sp
When using the `S3_SOURCE` as a remote store, the following connection information must be provided:
-- **IAM Role ARN**: The Amazon Resource Name (ARN) for an AWS Identity and Access Management (IAM) role.
-- **S3 bucket directory**: The name of the Amazon Simple Storage Service (Amazon S3) bucket in which the `STIX2` file is stored.
-- **Specify a directory or file**: The object key or directory path for the `STIX2` file in the S3 bucket.
+- **IAM Role ARN**: The Amazon Resource Name (ARN) for an AWS Identity and Access Management (IAM) role. When using the AWS OpenSearch Service, the role ARN needs to be in the same account as the OpenSearch domain. For more information about adding a new role for the AWS OpenSearch Service, see [Add service ARN](#add-aws-opensearch-service-arn).
+- **S3 bucket directory**: The name of the Amazon Simple Storage Service (Amazon S3) bucket in which the `STIX2` file is stored. To access an S3 bucket in a different AWS account, see the [Cross-account S3 bucket connection](#cross-account-s3-bucket-connection) section for more details.
+- **Specify a file**: The object key for the `STIX2` file in the S3 bucket.
- **Region**: The AWS Region for the S3 bucket.
You can also set the **Download schedule**, which determines to where OpenSearch downloads an updated `STIX2` file from the connected S3 bucket. The default interval is once a day. Only daily intervals are supported.
Alternatively, you can check the **Download on demand** option, which prevents new data from the bucket from being automatically downloaded.
+#### Add AWS OpenSearch Service ARN
+
+If you're using the AWS OpenSearch Service, create a new ARN role with a custom trust policy. For instructions on how to create the role, see [Creating a role for an AWS service](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_create_for-service.html#roles-creatingrole-service-console).
+
+When creating the role, customize the following settings:
+
+- Add the following custom trust policy:
+
+ ```bash
+ {
+ "Version": "2012-10-17",
+ "Statement": [
+ {
+ "Effect": "Allow",
+ "Principal": {
+ "Service": [
+ "opensearchservice.amazonaws.com"
+ ]
+ },
+ "Action": "sts:AssumeRole"
+ }
+ ]
+ }
+ ```
+
+- On the Permissions policies page, add the `AmazonS3ReadOnlyAccess` permission.
+
+
+#### Cross-account S3 bucket connection
+
+Because the role ARN needs to be in the same account as the OpenSearch domain, a trust policy needs to be configured that allows the OpenSearch domain to download from S3 buckets in the same account.
+
+To download from an S3 bucket in another account, the trust policy for that bucket needs to give the role ARN permission to read from the object, as shown in the following example:
+
+```
+{
+ "Version": "2012-10-17",
+ "Statement": [
+ {
+ "Effect": "Allow",
+ "Principal": {
+ "AWS": "arn:aws:iam::123456789012:role/account-1-threat-intel-role"
+ },
+ "Action": "s3:*",
+ "Resource": "arn:aws:s3:::account-2-threat-intel-bucket/*"
+ }
+ ]
+}
+```
## Step 2: Set up scanning for your log sources
diff --git a/_security/access-control/document-level-security.md b/_security/access-control/document-level-security.md
index 352fe06a61..b17b60e147 100644
--- a/_security/access-control/document-level-security.md
+++ b/_security/access-control/document-level-security.md
@@ -13,6 +13,8 @@ Document-level security lets you restrict a role to a subset of documents in an
![Document- and field-level security screen in OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/images/security-dls.png)
+The maximum size for the document-level security configuration is 1024 KB (1,048,404 characters).
+{: .warning}
## Simple roles
diff --git a/_security/audit-logs/index.md b/_security/audit-logs/index.md
index becb001ec0..8eeea33447 100644
--- a/_security/audit-logs/index.md
+++ b/_security/audit-logs/index.md
@@ -224,3 +224,36 @@ plugins.security.audit.config.threadpool.max_queue_len: 100000
To disable audit logs after they've been enabled, remove the `plugins.security.audit.type: internal_opensearch` setting from `opensearch.yml`, or switch off the **Enable audit logging** check box in OpenSearch Dashboards.
+## Audit user account manipulation
+
+To enable audit logging on changes to a security index, such as changes to roles mappings and role creation or deletion, use the following settings in the `compliance:` portion of the audit log configuration, as shown in the following example:
+
+```
+_meta:
+ type: "audit"
+ config_version: 2
+
+config:
+ # enable/disable audit logging
+ enabled: true
+
+ ...
+
+
+ compliance:
+ # enable/disable compliance
+ enabled: true
+
+ # Log updates to internal security changes
+ internal_config: true
+
+ # Log only metadata of the document for write events
+ write_metadata_only: false
+
+ # Log only diffs for document updates
+ write_log_diffs: true
+
+ # List of indices to watch for write events. Wildcard patterns are supported
+ # write_watched_indices: ["twitter", "logs-*"]
+ write_watched_indices: [".opendistro_security"]
+```
diff --git a/_security/authentication-backends/jwt.md b/_security/authentication-backends/jwt.md
index 3f28dfecfd..6c7311e7dc 100644
--- a/_security/authentication-backends/jwt.md
+++ b/_security/authentication-backends/jwt.md
@@ -117,7 +117,7 @@ The following table lists the configuration parameters.
Name | Description
:--- | :---
-`signing_key` | The signing key to use when verifying the token. If you use a symmetric key algorithm, it is the base64-encoded shared secret. If you use an asymmetric algorithm, it contains the public key.
+`signing_key` | The signing key(s) used to verify the token. If you use a symmetric key algorithm, this is the Base64-encoded shared secret. If you use an asymmetric algorithm, this setting contains the public key. To pass multiple keys, use a comma-separated list or enumerate the keys.
`jwt_header` | The HTTP header in which the token is transmitted. This is typically the `Authorization` header with the `Bearer` schema,`Authorization: Bearer `. Default is `Authorization`. Replacing this field with a value other than `Authorization` prevents the audit log from properly redacting the JWT header from audit messages. It is recommended that users only use `Authorization` when using JWTs with audit logging.
`jwt_url_parameter` | If the token is not transmitted in the HTTP header but rather as an URL parameter, define the name of the parameter here.
`subject_key` | The key in the JSON payload that stores the username. If not set, the [subject](https://tools.ietf.org/html/rfc7519#section-4.1.2) registered claim is used.
diff --git a/_security/configuration/disable-enable-security.md b/_security/configuration/disable-enable-security.md
index 811fd2a69f..38bcc01cdd 100755
--- a/_security/configuration/disable-enable-security.md
+++ b/_security/configuration/disable-enable-security.md
@@ -155,22 +155,22 @@ Use the following steps to reinstall the plugin:
1. Disable shard allocation and stop all nodes so that shards don't move when the cluster is restarted:
- ```json
- curl -XPUT "http://localhost:9200/_cluster/settings" -H 'Content-Type: application/json' -d '{
- "transient": {
- "cluster.routing.allocation.enable": "none"
- }
- }'
- ```
- {% include copy.html %}
+ ```json
+ curl -XPUT "http://localhost:9200/_cluster/settings" -H 'Content-Type: application/json' -d '{
+ "transient": {
+ "cluster.routing.allocation.enable": "none"
+ }
+ }'
+ ```
+ {% include copy.html %}
2. Install the Security plugin on all nodes in your cluster using one of the [installation methods]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/#install):
- ```bash
- bin/opensearch-plugin install opensearch-security
- ```
- {% include copy.html %}
-
+ ```bash
+ bin/opensearch-plugin install opensearch-security
+ ```
+ {% include copy.html %}
+
3. Add the necessary configuration to `opensearch.yml` for TLS encryption. See
[Configuration]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/security-settings/) for information about the settings that need to be configured.
diff --git a/_security/configuration/index.md b/_security/configuration/index.md
index e351e8865f..f68667d92d 100644
--- a/_security/configuration/index.md
+++ b/_security/configuration/index.md
@@ -3,7 +3,7 @@ layout: default
title: Configuration
nav_order: 2
has_children: true
-has_toc: false
+has_toc: true
redirect_from:
- /security-plugin/configuration/
- /security-plugin/configuration/index/
@@ -11,21 +11,105 @@ redirect_from:
# Security configuration
-The plugin includes demo certificates so that you can get up and running quickly. To use OpenSearch in a production environment, you must configure it manually:
+The Security plugin includes demo certificates so that you can get up and running quickly. To use OpenSearch with the Security plugin in a production environment, you must make changes to the demo certificates and other configuration options manually.
-1. [Replace the demo certificates]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/docker/#configuring-basic-security-settings).
-1. [Reconfigure `opensearch.yml` to use your certificates]({{site.url}}{{site.baseurl}}/security/configuration/tls).
-1. [Reconfigure `config.yml` to use your authentication backend]({{site.url}}{{site.baseurl}}/security/configuration/configuration/) (if you don't plan to use the internal user database).
-1. [Modify the configuration YAML files]({{site.url}}{{site.baseurl}}/security/configuration/yaml).
-1. If you plan to use the internal user database, [set a password policy in `opensearch.yml`]({{site.url}}{{site.baseurl}}/security/configuration/yaml/#opensearchyml).
-1. [Apply changes using the `securityadmin` script]({{site.url}}{{site.baseurl}}/security/configuration/security-admin).
-1. Start OpenSearch.
-1. [Add users, roles, role mappings, and tenants]({{site.url}}{{site.baseurl}}/security/access-control/index/).
+## Replace the demo certificates
-If you don't want to use the plugin, see [Disable security]({{site.url}}{{site.baseurl}}/security/configuration/disable-enable-security/).
+OpenSearch ships with demo certificates intended for quick setup and demonstration purposes. For a production environment, it's critical to replace these with your own trusted certificates, using the following steps, to ensure secure communication:
-The Security plugin has several default users, roles, action groups, permissions, and settings for OpenSearch Dashboards that use kibana in their names. We will change these names in a future release.
+1. **Generate your own certificates:** Use tools like OpenSSL or a certificate authority (CA) to generate your own certificates. For more information about generating certificates with OpenSSL, see [Generating self-signed certificates]({{site.url}}{{site.baseurl}}/security/configuration/generate-certificates/).
+2. **Store the generated certificates and private key in the appropriate directory:** Generated certificates are typically stored in `/config/`. For more information, see [Add certificate files to opensearch.yml]({{site.url}}{{site.baseurl}}/security/configuration/generate-certificates/#add-certificate-files-to-opensearchyml).
+3. **Set the following file permissions:**
+ - Private key (.key files): Set the file mode to `600`. This restricts access so that only the file owner (the OpenSearch user) can read and write to the file, ensuring that the private key remains secure and inaccessible to unauthorized users.
+ - Public certificates (.crt, .pem files): Set the file mode to `644`. This allows the file owner to read and write to the file, while other users can only read it.
+
+For additional guidance on file modes, see the following table.
+
+ | Item | Sample | Numeric | Bitwise |
+ |-------------|---------------------|---------|--------------|
+ | Public key | `~/.ssh/id_rsa.pub` | `644` | `-rw-r--r--` |
+ | Private key | `~/.ssh/id_rsa` | `600` | `-rw-------` |
+ | SSH folder | `~/.ssh` | `700` | `drwx------` |
+
+For more information, see [Configuring basic security settings]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/docker/#configuring-basic-security-settings).
+
+## Reconfigure `opensearch.yml` to use your certificates
+
+The `opensearch.yml` file is the main configuration file for OpenSearch; you can find the file at `/config/opensearch.yml`. Use the following steps to update this file to point to your custom certificates:
+
+In `opensearch.yml`, set the correct paths for your certificates and keys, as shown in the following example:
+ ```
+ plugins.security.ssl.transport.pemcert_filepath: /path/to/your/cert.pem
+ plugins.security.ssl.transport.pemkey_filepath: /path/to/your/key.pem
+ plugins.security.ssl.transport.pemtrustedcas_filepath: /path/to/your/ca.pem
+ plugins.security.ssl.http.enabled: true
+ plugins.security.ssl.http.pemcert_filepath: /path/to/your/cert.pem
+ plugins.security.ssl.http.pemkey_filepath: /path/to/your/key.pem
+ plugins.security.ssl.http.pemtrustedcas_filepath: /path/to/your/ca.pem
+ ```
+For more information, see [Configuring TLS certificates]({{site.url}}{{site.baseurl}}/security/configuration/tls/).
+
+## Reconfigure `config.yml` to use your authentication backend
+
+The `config.yml` file allows you to configure the authentication and authorization mechanisms for OpenSearch. Update the authentication backend settings in `/config/opensearch-security/config.yml` according to your requirements.
+
+For example, to use the internal user database as your authentication backend, add the following settings:
+
+ ```
+ authc:
+ basic_internal_auth:
+ http_enabled: true
+ transport_enabled: true
+ order: 1
+ http_authenticator:
+ type: basic
+ challenge: true
+ authentication_backend:
+ type: internal
+ ```
+For more information, see [Configuring the Security backend]({{site.url}}{{site.baseurl}}/security/configuration/configuration/).
+
+## Modify the configuration YAML files
+
+Determine whether any additional YAML files need modification, for example, the `roles.yml`, `roles_mapping.yml`, or `internal_users.yml` files. Update the files with any additional configuration information. For more information, see [Modifying the YAML files]({{site.url}}{{site.baseurl}}/security/configuration/yaml/).
+
+## Set a password policy
+
+When using the internal user database, we recommend enforcing a password policy to ensure that strong passwords are used. For information about strong password policies, see [Password settings]({{site.url}}{{site.baseurl}}/security/configuration/yaml/#password-settings).
+
+## Apply changes using the `securityadmin` script
+
+The following steps do not apply to first-time users because the security index is automatically initialized from the YAML configuration files when OpenSearch starts.
+{: .note}
+
+After initial setup, if you make changes to your security configuration or disable automatic initialization by setting `plugins.security.allow_default_init_securityindex` to `false` (which prevents security index initialization from `yaml` files), you need to manually apply changes using the `securityadmin` script:
+
+1. Find the `securityadmin` script. The script is typically stored in the OpenSearch plugins directory, `plugins/opensearch-security/tools/securityadmin.[sh|bat]`.
+ - Note: If you're using OpenSearch 1.x, the `securityadmin` script is located in the `plugins/opendistro_security/tools/` directory.
+ - For more information, see [Basic usage](https://opensearch.org/docs/latest/security/configuration/security-admin/#basic-usage).
+2. Run the script by using the following command:
+ ```
+ ./plugins/opensearch-security/tools/securityadmin.[sh|bat]
+ ```
+3. Check the OpenSearch logs and configuration to ensure that the changes have been successfully applied.
+
+For more information about using the `securityadmin` script, see [Applying changes to configuration files]({{site.url}}{{site.baseurl}}/security/configuration/security-admin/).
+
+## Disable or enable the Security plugin
+
+If you don't want to use the Security plugin, you can disable it by adding the following setting to the `opensearch.yml` file:
+
+```
+plugins.security.disabled: true
+```
+
+You can then enable the plugin by removing the `plugins.security.disabled` setting.
+
+For more information about disabling the Security plugin, see [Disable security]({{site.url}}{{site.baseurl}}/security/configuration/disable-enable-security/).
+
+The Security plugin has several default users, roles, action groups, permissions, and settings for OpenSearch Dashboards that contain "Kibana" in their names. We will change these names in a future version.
{: .note }
-For a full list of `opensearch.yml` Security plugin settings, Security plugin settings, see [Security settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/security-settings/).
+For a full list of `opensearch.yml` Security plugin settings, see [Security settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/security-settings/).
{: .note}
+
diff --git a/_security/configuration/yaml.md b/_security/configuration/yaml.md
index 1686c8332e..2694e3a24f 100644
--- a/_security/configuration/yaml.md
+++ b/_security/configuration/yaml.md
@@ -265,7 +265,7 @@ kibana_server:
## roles.yml
-This file contains any initial roles that you want to add to the Security plugin. Aside from some metadata, the default file is empty, because the Security plugin has a number of static roles that it adds automatically.
+This file contains any initial roles that you want to add to the Security plugin. By default, this file contains predefined roles that grant usage to plugins within the default distribution of OpenSearch. The Security plugin will also add a number of static roles automatically.
```yml
---
diff --git a/_tools/index.md b/_tools/index.md
index 108f10da97..c9d446a81a 100644
--- a/_tools/index.md
+++ b/_tools/index.md
@@ -18,6 +18,7 @@ This section provides documentation for OpenSearch-supported tools, including:
- [OpenSearch CLI](#opensearch-cli)
- [OpenSearch Kubernetes operator](#opensearch-kubernetes-operator)
- [OpenSearch upgrade, migration, and comparison tools](#opensearch-upgrade-migration-and-comparison-tools)
+- [Sycamore](#sycamore) for AI-powered extract, transform, load (ETL) on complex documents for vector and hybrid search
For information about Data Prepper, the server-side data collector for filtering, enriching, transforming, normalizing, and aggregating data for downstream analytics and visualization, see [Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/index/).
@@ -122,3 +123,9 @@ The OpenSearch Kubernetes Operator is an open-source Kubernetes operator that he
OpenSearch migration tools facilitate migrations to OpenSearch and upgrades to newer versions of OpenSearch. These can help you can set up a proof-of-concept environment locally using Docker containers or deploy to AWS using a one-click deployment script. This empowers you to fine-tune cluster configurations and manage workloads more effectively before migration.
For more information about OpenSearch migration tools, see the documentation in the [OpenSearch Migration GitHub repository](https://github.com/opensearch-project/opensearch-migrations/tree/capture-and-replay-v0.1.0).
+
+## Sycamore
+
+[Sycamore](https://github.com/aryn-ai/sycamore) is an open-source, AI-powered document processing engine designed to prepare unstructured data for retrieval-augmented generation (RAG) and semantic search using Python. Sycamore supports chunking and enriching a wide range of complex document types, including reports, presentations, transcripts, and manuals. Additionally, Sycamore can extract and process embedded elements, such as tables, figures, graphs, and other infographics. It can then load the data into target indexes, including vector and keyword indexes, using an [OpenSearch connector](https://sycamore.readthedocs.io/en/stable/sycamore/connectors/opensearch.html).
+
+For more information, see [Sycamore]({{site.url}}{{site.baseurl}}/tools/sycamore/).
diff --git a/_tools/sycamore.md b/_tools/sycamore.md
new file mode 100644
index 0000000000..9b3986dbf3
--- /dev/null
+++ b/_tools/sycamore.md
@@ -0,0 +1,48 @@
+---
+layout: default
+title: Sycamore
+nav_order: 210
+has_children: false
+---
+
+# Sycamore
+
+[Sycamore](https://github.com/aryn-ai/sycamore) is an open-source, AI-powered document processing engine designed to prepare unstructured data for retrieval-augmented generation (RAG) and semantic search using Python. Sycamore supports chunking and enriching a wide range of complex document types, including reports, presentations, transcripts, and manuals. Additionally, Sycamore can extract and process embedded elements, such as tables, figures, graphs, and other infographics. It can then load the data into target indexes, including vector and keyword indexes, using a connector like the [OpenSearch connector](https://sycamore.readthedocs.io/en/stable/sycamore/connectors/opensearch.html).
+
+To get started, visit the [Sycamore documentation](https://sycamore.readthedocs.io/en/stable/sycamore/get_started.html).
+
+## Sycamore ETL pipeline structure
+
+A Sycamore extract, transform, load (ETL) pipeline applies a series of transformations to a [DocSet](https://sycamore.readthedocs.io/en/stable/sycamore/get_started/concepts.html#docsets), which is a collection of documents and their constituent elements (for example, tables, blocks of text, or headers). At the end of the pipeline, the DocSet is loaded into OpenSearch vector and keyword indexes.
+
+A typical pipeline for preparing unstructured data for vector or hybrid search in OpenSearch consists of the following steps:
+
+* Read documents into a [DocSet](https://sycamore.readthedocs.io/en/stable/sycamore/get_started/concepts.html#docsets).
+* [Partition documents](https://sycamore.readthedocs.io/en/stable/sycamore/transforms/partition.html) into structured JSON elements.
+* Extract metadata and filter and clean data using [transforms](https://sycamore.readthedocs.io/en/stable/sycamore/APIs/docset.html).
+* Create [chunks](https://sycamore.readthedocs.io/en/stable/sycamore/transforms/merge.html) from groups of elements.
+* Embed the chunks using the model of your choice.
+* [Load](https://sycamore.readthedocs.io/en/stable/sycamore/connectors/opensearch.html) the embeddings, metadata, and text into OpenSearch vector and keyword indexes.
+
+For an example pipeline that uses this workflow, see [this notebook](https://github.com/aryn-ai/sycamore/blob/main/notebooks/opensearch_docs_etl.ipynb).
+
+
+## Install Sycamore
+
+We recommend installing the Sycamore library using `pip`. The connector for OpenSearch can be specified and installed using extras. For example:
+
+```bash
+pip install sycamore-ai[opensearch]
+```
+{% include copy.html %}
+
+By default, Sycamore works with the Aryn Partitioning Service to process PDFs. To run inference locally for partitioning or embedding, install Sycamore with the `local-inference` extra as follows:
+
+```bash
+pip install sycamore-ai[opensearch,local-inference]
+```
+{% include copy.html %}
+
+## Next steps
+
+For more information, visit the [Sycamore documentation](https://sycamore.readthedocs.io/en/stable/sycamore/get_started.html).
diff --git a/_troubleshoot/tls.md b/_troubleshoot/tls.md
index 93e9a2c490..6c777ad5b8 100644
--- a/_troubleshoot/tls.md
+++ b/_troubleshoot/tls.md
@@ -207,7 +207,7 @@ plugins.security.ssl.http.enabled_protocols:
TLS relies on the server and client negotiating a common cipher suite. Depending on your system, the available ciphers will vary. They depend on the JDK or OpenSSL version you're using, and whether or not the `JCE Unlimited Strength Jurisdiction Policy Files` are installed.
-For legal reasons, the JDK does not include strong ciphers like AES256. In order to use strong ciphers you need to download and install the [Java Cryptography Extension (JCE) Unlimited Strength Jurisdiction Policy Files](https://www.oracle.com/technetwork/java/javase/downloads/jce8-download-2133166.html). If you don't have them installed, you might see an error message on startup:
+For legal reasons, the JDK does not include strong ciphers like AES256. In order to use strong ciphers you need to download and install the [Java Cryptography Extension (JCE) Unlimited Strength Jurisdiction Policy Files](https://www.oracle.com/java/technologies/javase-jce8-downloads.html). If you don't have them installed, you might see an error message on startup:
```
[INFO ] AES-256 not supported, max key length for AES is 128 bit.
diff --git a/_tuning-your-cluster/availability-and-recovery/remote-store/remote-cluster-state.md b/_tuning-your-cluster/availability-and-recovery/remote-store/remote-cluster-state.md
index d967aca914..03cd1716f0 100644
--- a/_tuning-your-cluster/availability-and-recovery/remote-store/remote-cluster-state.md
+++ b/_tuning-your-cluster/availability-and-recovery/remote-store/remote-cluster-state.md
@@ -67,10 +67,14 @@ The remote cluster state functionality has the following limitations:
## Remote cluster state publication
-
The cluster manager node processes updates to the cluster state. It then publishes the updated cluster state through the local transport layer to all of the follower nodes. With the `remote_store.publication` feature enabled, the cluster state is backed up to the remote store during every state update. The follower nodes can then fetch the state from the remote store directly, which reduces the overhead on the cluster manager node for publication.
-To enable the feature flag for the `remote_store.publication` feature, follow the steps in the [experimental feature flag documentation]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/experimental/).
+To enable this feature, configure the following setting in `opensearch.yml`:
+
+```yml
+# Enable Remote cluster state publication
+cluster.remote_store.publication.enabled: true
+```
Enabling the setting does not change the publication flow, and follower nodes will not send acknowledgements back to the cluster manager node
until they download the updated cluster state from the remote store.
@@ -89,8 +93,11 @@ You do not have to use different remote store repositories for state and routing
To configure remote publication, use the following cluster settings.
-Setting | Default | Description
-:--- | :--- | :---
-`cluster.remote_store.state.read_timeout` | 20s | The amount of time to wait for remote state download to complete on the follower node.
-`cluster.remote_store.routing_table.path_type` | HASHED_PREFIX | The path type to be used for creating an index routing path in the blob store. Valid values are `FIXED`, `HASHED_PREFIX`, and `HASHED_INFIX`.
-`cluster.remote_store.routing_table.path_hash_algo` | FNV_1A_BASE64 | The algorithm to be used for constructing the prefix or infix of the blob store path. This setting is applied if `cluster.remote_store.routing_table.path_type` is `hashed_prefix` or `hashed_infix`. Valid algorithm values are `FNV_1A_BASE64` and `FNV_1A_COMPOSITE_1`.
+Setting | Default | Description
+:--- |:---| :---
+`cluster.remote_store.state.read_timeout` | 20s | The amount of time to wait for the remote state download to complete on the follower node.
+`cluster.remote_store.state.path.prefix` | "" (Empty string) | The fixed prefix to add to the index metadata files in the blob store.
+`cluster.remote_store.index_metadata.path_type` | `HASHED_PREFIX` | The path type used for creating an index metadata path in the blob store. Valid values are `FIXED`, `HASHED_PREFIX`, and `HASHED_INFIX`.
+`cluster.remote_store.index_metadata.path_hash_algo` | `FNV_1A_BASE64` | The algorithm that constructs the prefix or infix for the index metadata path in the blob store. This setting is applied if the `cluster.remote_store.index_metadata.path_type` setting is `HASHED_PREFIX` or `HASHED_INFIX`. Valid algorithm values are `FNV_1A_BASE64` and `FNV_1A_COMPOSITE_1`.
+`cluster.remote_store.routing_table.path.prefix` | "" (Empty string) | The fixed prefix to add for the index routing files in the blob store.
+
diff --git a/_tuning-your-cluster/availability-and-recovery/remote-store/snapshot-interoperability.md b/_tuning-your-cluster/availability-and-recovery/remote-store/snapshot-interoperability.md
index 0415af65f1..e93f504be3 100644
--- a/_tuning-your-cluster/availability-and-recovery/remote-store/snapshot-interoperability.md
+++ b/_tuning-your-cluster/availability-and-recovery/remote-store/snapshot-interoperability.md
@@ -27,7 +27,7 @@ PUT /_snapshot/snap_repo
```
{% include copy-curl.html %}
-Once enabled, all requests using the [Snapshot API]({{site.url}}{{site.baseurl}}/api-reference/snapshots/index/) will remain the same for all snapshots. After the setting is enabled, we recommend not disabling the setting. Doing so could affect data durability.
+Once enabled, all requests using the [Snapshot API]({{site.url}}{{site.baseurl}}/api-reference/snapshots/index/) will remain the same for all snapshots. Therefore, do not disable the shallow snapshot setting after it has been enabled because disabling the setting could affect data durability.
## Considerations
@@ -37,3 +37,43 @@ Consider the following before using shallow copy snapshots:
- All nodes in the cluster must use OpenSearch 2.10 or later to take advantage of shallow copy snapshots.
- The `incremental` file count and size between the current snapshot and the last snapshot is `0` when using shallow copy snapshots.
- Searchable snapshots are not supported inside shallow copy snapshots.
+
+## Shallow snapshot v2
+
+Starting with OpenSearch 2.17, the shallow snapshot feature offers an improved version called `shallow snapshot v2`, which aims to make snapshot operations more efficient and scalable by introducing the following enhancements:
+
+* Deterministic snapshot operations: Shallow snapshot v2 makes snapshot operations more deterministic, ensuring consistent and predictable behavior.
+* Minimized cluster state updates: Shallow snapshot v2 minimizes the number of cluster state updates required during snapshot operations, reducing overhead and improving performance.
+* Scalability: Shallow snapshot v2 allows snapshot operations to scale independently of the number of shards in the cluster, enabling better performance and efficiency for large datasets.
+
+Shallow snapshot v2 must be enabled separately from shallow copies.
+
+### Enabling shallow snapshot v2
+
+To enable shallow snapshot v2, enable the following repository settings:
+
+- `remote_store_index_shallow_copy: true`
+- `shallow_snapshot_v2: true`
+
+The following example request creates a shallow snapshot v2 repository:
+
+```bash
+PUT /_snapshot/snap_repo
+{
+"type": "s3",
+"settings": {
+"bucket": "test-bucket",
+"base_path": "daily-snaps",
+"remote_store_index_shallow_copy": true,
+"shallow_snapshot_v2": true
+}
+}
+```
+{% include copy-curl.html %}
+
+### Limitations
+
+Shallow snapshot v2 has the following limitations:
+
+* Shallow snapshot v2 is only supported for remote-backed indexes.
+* All nodes in the cluster must use OpenSearch 2.17 or later to take advantage of shallow snapshot v2.
diff --git a/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md b/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md
index b9e35b2697..d13955f3f0 100644
--- a/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md
+++ b/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md
@@ -18,7 +18,7 @@ The searchable snapshot feature incorporates techniques like caching frequently
To configure the searchable snapshots feature, create a node in your `opensearch.yml file` and define the node role as `search`. Optionally, you can also configure the `cache.size` property for the node.
-A `search` node reserves storage for the cache to perform searchable snapshot queries. In the case of a dedicated search node where the node exclusively has the `search` role, this value defaults to a fixed percentage of available storage. In other cases, the value needs to be configured by the user using the `node.search.cache.size` setting.
+A `search` node reserves storage for the cache to perform searchable snapshot queries. In the case of a dedicated search node where the node exclusively has the `search` role, this value defaults to a fixed percentage (80%) of available storage. In other cases, the value needs to be configured by the user using the `node.search.cache.size` setting.
Parameter | Type | Description
:--- | :--- | :---
diff --git a/_tuning-your-cluster/index.md b/_tuning-your-cluster/index.md
index 99db78565f..fa0973395f 100644
--- a/_tuning-your-cluster/index.md
+++ b/_tuning-your-cluster/index.md
@@ -192,11 +192,27 @@ To better understand and monitor your cluster, use the [CAT API]({{site.url}}{{s
## (Advanced) Step 6: Configure shard allocation awareness or forced awareness
+To further fine-tune your shard allocation, you can set custom node attributes for shard allocation awareness or forced awareness.
+
### Shard allocation awareness
-If your nodes are spread across several geographical zones, you can configure shard allocation awareness to allocate all replica shards to a zone that’s different from their primary shard.
+You can set custom node attributes on OpenSearch nodes to be used for shard allocation awareness. For example, you can set the `zone` attribute on each node to represent the zone in which the node is located. You can also use the `zone` attribute to ensure that the primary shard and its replica shards are allocated in a balanced manner across available, distinct zones. In this scenario, maximum shard copies per zone would equal `ceil(number_of_shard_copies/number_of_distinct_zones)`.
+
+OpenSearch, by default, allocates shard copies of a single shard across different nodes. When only 1 zone is available, such as after a zone failure, OpenSearch allocates replica shards to the only remaining zone---it considers only available zones (attribute values) when calculating the maximum number of allowed shard copies per zone.
+
+For example, if your index has a total of 5 shard copies (1 primary and 4 replicas) and nodes in 3 distinct zones, then OpenSearch will perform the following to allocate all 5 shard copies:
+
+- Allocate no more than 2 shards per zone, which will require at least 2 nodes in 2 zones.
+- Allocate the last shard in the third zone, with at least 1 node needed in the third zone.
-With shard allocation awareness, if the nodes in one of your zones fail, you can be assured that your replica shards are spread across your other zones. It adds a layer of fault tolerance to ensure your data survives a zone failure beyond just individual node failures.
+Alternatively, if you have 3 nodes in the first zone and 1 node in each remaining zone, then OpenSearch will allocate:
+
+- 2 shard copies in the first zone.
+- 1 shard copy in the remaining 2 zones.
+
+The final shard copy will remain unallocated due to the lack of nodes.
+
+With shard allocation awareness, if the nodes in one of your zones fail, you can be assured that your replica shards are spread across your other zones, adding a layer of fault tolerance to ensure that your data survives zone failures.
To configure shard allocation awareness, add zone attributes to `opensearch-d1` and `opensearch-d2`, respectively:
@@ -219,6 +235,8 @@ PUT _cluster/settings
}
```
+You can also use multiple attributes for shard allocation awareness by providing the attributes as a comma-separated string, for example, `zone,rack`.
+
You can either use `persistent` or `transient` settings. We recommend the `persistent` setting because it persists through a cluster reboot. Transient settings don't persist through a cluster reboot.
Shard allocation awareness attempts to separate primary and replica shards across multiple zones. However, if only one zone is available (such as after a zone failure), OpenSearch allocates replica shards to the only remaining zone.
diff --git a/_tuning-your-cluster/replication-plugin/auto-follow.md b/_tuning-your-cluster/replication-plugin/auto-follow.md
index 828b835387..92e7a6c144 100644
--- a/_tuning-your-cluster/replication-plugin/auto-follow.md
+++ b/_tuning-your-cluster/replication-plugin/auto-follow.md
@@ -98,9 +98,9 @@ To delete a replication rule, send the following request to the follower cluster
```bash
curl -XDELETE -k -H 'Content-Type: application/json' -u 'admin:' 'https://localhost:9200/_plugins/_replication/_autofollow?pretty' -d '
{
- "leader_alias" : "my-conection-alias",
+ "leader_alias" : "my-connection-alias",
"name": "my-replication-rule"
}'
```
-When you delete a replication rule, OpenSearch stops replicating *new* indexes that match the pattern, but existing indexes that the rule previously created remain read-only and continue to replicate. If you need to stop existing replication activity and open the indexes up for writes, use the [stop replication API operation]({{site.url}}{{site.baseurl}}/replication-plugin/api/#stop-replication).
\ No newline at end of file
+When you delete a replication rule, OpenSearch stops replicating *new* indexes that match the pattern, but existing indexes that the rule previously created remain read-only and continue to replicate. If you need to stop existing replication activity and open the indexes up for writes, use the [stop replication API operation]({{site.url}}{{site.baseurl}}/replication-plugin/api/#stop-replication).
diff --git a/assets/examples/ecommerce.json b/assets/examples/ecommerce.ndjson
similarity index 100%
rename from assets/examples/ecommerce.json
rename to assets/examples/ecommerce.ndjson
diff --git a/assets/js/search.js b/assets/js/search.js
index 8d9cab2ec5..86970d9544 100644
--- a/assets/js/search.js
+++ b/assets/js/search.js
@@ -173,7 +173,10 @@
const showNoResults = () => {
emptyResults();
- elResults.appendChild(document.createRange().createContextualFragment('No results found!'));
+ const resultElement = document.createElement('div');
+ resultElement.classList.add('search-page--results--no-results');
+ resultElement.appendChild(document.createRange().createContextualFragment('No results found.'));
+ elResults.appendChild(resultElement);
showResults();
elSpinner?.classList.remove(CLASSNAME_SPINNING);
};
@@ -278,8 +281,6 @@
window.doResultsPageSearch = async (query, type, version) => {
- console.log("Running results page search!");
-
const searchResultsContainer = document.getElementById('searchPageResultsContainer');
try {
@@ -291,7 +292,7 @@ window.doResultsPageSearch = async (query, type, version) => {
if (data.results && data.results.length > 0) {
data.results.forEach(result => {
const resultElement = document.createElement('div');
- resultElement.classList.add('search-page--results--diplay--container--item');
+ resultElement.classList.add('search-page--results--display--container--item');
const contentCite = document.createElement('cite');
const crumbs = [...result.ancestors];
@@ -302,11 +303,9 @@ window.doResultsPageSearch = async (query, type, version) => {
const titleLink = document.createElement('a');
titleLink.href = result.url;
+ titleLink.classList.add('search-page--results--display--container--item--link');
titleLink.textContent = result.title;
- titleLink.style.fontSize = '1.5em';
- titleLink.style.fontWeight = 'bold';
- titleLink.style.display = 'block';
-
+
const contentSpan = document.createElement('span');
contentSpan.textContent = result.content;
contentSpan.style.display = 'block';
@@ -317,16 +316,10 @@ window.doResultsPageSearch = async (query, type, version) => {
// Append the result element to the searchResultsContainer
searchResultsContainer.appendChild(resultElement);
-
- const breakline = document.createElement('hr');
- breakline.style.border = '.5px solid #ccc';
- breakline.style.margin = 'auto';
- searchResultsContainer.appendChild(breakline);
});
} else {
const noResultsElement = document.createElement('div');
noResultsElement.textContent = 'No results found.';
- noResultsElement.style.fontSize = '2em';
searchResultsContainer.appendChild(noResultsElement);
}
} catch (error) {
diff --git a/build.sh b/build.sh
index 060bbfa666..85ef617931 100755
--- a/build.sh
+++ b/build.sh
@@ -1,3 +1,9 @@
#!/usr/bin/env bash
-JEKYLL_LINK_CHECKER=internal bundle exec jekyll serve --host localhost --port 4000 --incremental --livereload --open-url --trace
+host="localhost"
+
+if [[ "$DOCKER_BUILD" == "true" ]]; then
+ host="0.0.0.0"
+fi
+
+JEKYLL_LINK_CHECKER=internal bundle exec jekyll serve --host ${host} --port 4000 --incremental --livereload --open-url --trace
diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml
new file mode 100644
index 0000000000..04dd007db9
--- /dev/null
+++ b/docker-compose.dev.yml
@@ -0,0 +1,14 @@
+version: "3"
+
+services:
+ doc_builder:
+ image: ruby:3.2.4
+ volumes:
+ - .:/app
+ working_dir: /app
+ ports:
+ - "4000:4000"
+ command: bash -c "bundler install && bash build.sh"
+ environment:
+ BUNDLE_PATH: /app/vendor/bundle # Avoid installing gems globally.
+ DOCKER_BUILD: true # Signify build.sh to bind to 0.0.0.0 for effective doc access from host.
diff --git a/release-notes/opensearch-documentation-release-notes-2.17.0.md b/release-notes/opensearch-documentation-release-notes-2.17.0.md
new file mode 100644
index 0000000000..d9ed51737c
--- /dev/null
+++ b/release-notes/opensearch-documentation-release-notes-2.17.0.md
@@ -0,0 +1,36 @@
+# OpenSearch Documentation Website 2.17.0 Release Notes
+
+The OpenSearch 2.17.0 documentation includes the following additions and updates.
+
+## New documentation for 2.17.0
+
+- Get offline batch inference details using task API in ml-commons [#8305](https://github.com/opensearch-project/documentation-website/pull/8305)
+- Documentation for Binary Quantization Support with KNN Vector Search [#8281](https://github.com/opensearch-project/documentation-website/pull/8281)
+- add offline batch ingestion tech doc [#8251](https://github.com/opensearch-project/documentation-website/pull/8251)
+- Add documentation changes for disk-based k-NN [#8246](https://github.com/opensearch-project/documentation-website/pull/8246)
+- Derived field updates for 2.17 [#8244](https://github.com/opensearch-project/documentation-website/pull/8244)
+- Add changes for multiple signing keys [#8243](https://github.com/opensearch-project/documentation-website/pull/8243)
+- Add documentation changes for Snapshot Status API [#8235](https://github.com/opensearch-project/documentation-website/pull/8235)
+- Update flow framework additional fields in previous_node_inputs [#8233](https://github.com/opensearch-project/documentation-website/pull/8233)
+- Add documentation changes for shallow snapshot v2 [#8207](https://github.com/opensearch-project/documentation-website/pull/8207)
+- Add documentation for context and ABC templates [#8197](https://github.com/opensearch-project/documentation-website/pull/8197)
+- Create documentation for snapshots with hashed prefix path type [#8196](https://github.com/opensearch-project/documentation-website/pull/8196)
+- Adding documentation for remote index use in AD [#8191](https://github.com/opensearch-project/documentation-website/pull/8191)
+- Doc update for concurrent search [#8181](https://github.com/opensearch-project/documentation-website/pull/8181)
+- Adding new cluster search setting docs [#8180](https://github.com/opensearch-project/documentation-website/pull/8180)
+- Add new settings for remote publication [#8176](https://github.com/opensearch-project/documentation-website/pull/8176)
+- Grouping Top N queries documentation [#8173](https://github.com/opensearch-project/documentation-website/pull/8173)
+- Document reprovision param for Update Workflow API [#8172](https://github.com/opensearch-project/documentation-website/pull/8172)
+- Add documentation for Faiss byte vector [#8170](https://github.com/opensearch-project/documentation-website/pull/8170)
+- Terms query can accept encoded terms input as bitmap [#8133](https://github.com/opensearch-project/documentation-website/pull/8133)
+- Update doc for adding new param in cat shards action for cancellation… [#8127](https://github.com/opensearch-project/documentation-website/pull/8127)
+- Add docs on skip_validating_missing_parameters in ml-commons connector [#8118](https://github.com/opensearch-project/documentation-website/pull/8118)
+- Add Split Response Processor to 2.17 Search Pipeline docs [#8081](https://github.com/opensearch-project/documentation-website/pull/8081)
+- Added documentation for FGAC for Flow Framework [#8076](https://github.com/opensearch-project/documentation-website/pull/8076)
+- Remove composite agg limitations for concurrent search [#7904](https://github.com/opensearch-project/documentation-website/pull/7904)
+- Add doc for nodes stats search.request.took fields [#7887](https://github.com/opensearch-project/documentation-website/pull/7887)
+- Add documentation for ignore_hosts config option for ip-based rate limiting [#7859](https://github.com/opensearch-project/documentation-website/pull/7859)
+
+## Documentation for 2.17.0 experimental features
+
+- Document new experimental ingestion streaming APIs [#8123](https://github.com/opensearch-project/documentation-website/pull/8123)
- `connector.pre_process.cohere.embedding` for [Cohere](https://cohere.com/) embedding models
- `connector.pre_process.openai.embedding` for [OpenAI](https://platform.openai.com/docs/guides/embeddings) embedding models
- `connector.pre_process.default.embedding`, which you can use to preprocess documents in neural search requests so that they are in the format that ML Commons can process with the default preprocessor (OpenSearch 2.11 or later). For more information, see [Built-in functions](#built-in-pre--and-post-processing-functions). | | `post_process_function` | String | Optional. A built-in or custom Painless script used to post-process the model output data. OpenSearch provides the following built-in post-process functions that you can call directly:
- `connector.pre_process.cohere.embedding` for [Cohere text embedding models](https://docs.cohere.com/reference/embed)
- `connector.pre_process.openai.embedding` for [OpenAI text embedding models](https://platform.openai.com/docs/api-reference/embeddings)
- `connector.post_process.default.embedding`, which you can use to post-process documents in the model response so that they are in the format that neural search expects (OpenSearch 2.11 or later). For more information, see [Built-in functions](#built-in-pre--and-post-processing-functions). | - +| `headers` | JSON object | Specifies the headers used in the request or response body. Default is `ContentType: application/json`. If your third-party ML tool requires access control, define the required `credential` parameters in the `headers` parameter. | The `client_config` parameter supports the following options. diff --git a/_ml-commons-plugin/tutorials/semantic-search-byte-vectors.md b/_ml-commons-plugin/tutorials/semantic-search-byte-vectors.md index 7061d3cb5a..c4cc27f660 100644 --- a/_ml-commons-plugin/tutorials/semantic-search-byte-vectors.md +++ b/_ml-commons-plugin/tutorials/semantic-search-byte-vectors.md @@ -7,7 +7,7 @@ nav_order: 10 # Semantic search using byte-quantized vectors -This tutorial illustrates how to build a semantic search using the [Cohere Embed model](https://docs.cohere.com/reference/embed) and byte-quantized vectors. For more information about using byte-quantized vectors, see [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#lucene-byte-vector). +This tutorial shows you how to build a semantic search using the [Cohere Embed model](https://docs.cohere.com/reference/embed) and byte-quantized vectors. For more information about using byte-quantized vectors, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/#byte-vectors). The Cohere Embed v3 model supports several `embedding_types`. For this tutorial, you'll use the `INT8` type to encode byte-quantized vectors. 
diff --git a/_monitoring-your-cluster/pa/index.md b/_monitoring-your-cluster/pa/index.md index bb4f9c6c30..156e985e8b 100644 --- a/_monitoring-your-cluster/pa/index.md +++ b/_monitoring-your-cluster/pa/index.md @@ -60,7 +60,7 @@ private-key-file-path = specify_path The Performance Analyzer plugin is included in the installations for [Docker]({{site.url}}{{site.baseurl}}/opensearch/install/docker/) and [tarball]({{site.url}}{{site.baseurl}}/opensearch/install/tar/), but you can also install the plugin manually. -To install the Performance Analyzer plugin manually, download the plugin from [Maven](https://search.maven.org/search?q=org.opensearch.plugin) and install it using the standard [plugin installation]({{site.url}}{{site.baseurl}}/opensearch/install/plugins/) process. Performance Analyzer runs on each node in a cluster. +To install the Performance Analyzer plugin manually, download the plugin from [Maven](https://central.sonatype.com/namespace/org.opensearch.plugin) and install it using the standard [plugin installation]({{site.url}}{{site.baseurl}}/opensearch/install/plugins/) process. Performance Analyzer runs on each node in a cluster. To start the Performance Analyzer root cause analysis (RCA) agent on a tarball installation, run the following command: diff --git a/_observing-your-data/ad/dashboards-anomaly-detection.md b/_observing-your-data/ad/dashboards-anomaly-detection.md index 679237094a..ad6fa5950b 100644 --- a/_observing-your-data/ad/dashboards-anomaly-detection.md +++ b/_observing-your-data/ad/dashboards-anomaly-detection.md @@ -18,12 +18,12 @@ You can connect data visualizations to OpenSearch datasets and then create, run, Before getting started, you must have: - Installed OpenSearch and OpenSearch Dashboards version 2.9 or later. See [Installing OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/index/). -- Installed the Anomaly Detection plugin version 2.9 or later. 
See [Installing OpenSearch plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins). +- Installed the Anomaly Detection plugin version 2.9 or later. See [Installing OpenSearch plugins]({{site.url}}{{site.baseurl}}/install-and-configure/plugins/). - Installed the Anomaly Detection Dashboards plugin version 2.9 or later. See [Managing OpenSearch Dashboards plugins]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/plugins/) to get started. ## General requirements for anomaly detection visualizations -Anomaly detection visualizations are displayed as time-series charts that give you a snapshot of when anomalies have occurred from different anomaly detectors you have configured for the visualization. You can display up to 10 metrics on your chart, and each series can be shown as a line on the chart. Note that only real-time anomalies will be visible on the chart. For more information on real-time and historical anomaly detection, see [Anomaly detection, Step 3: Set up detector jobs]({{site.url}}{{site.baseurl}}/observing-your-data/ad/index/#step-3-set-up-detector-jobs). +Anomaly detection visualizations are displayed as time-series charts that give you a snapshot of when anomalies have occurred from different anomaly detectors you have configured for the visualization. You can display up to 10 metrics on your chart, and each series can be shown as a line on the chart. Note that only real-time anomalies will be visible on the chart. For more information about real-time and historical anomaly detection, see [Anomaly detection, Step 3: Set up detector jobs]({{site.url}}{{site.baseurl}}/observing-your-data/ad/index/#step-3-setting-up-detector-jobs). Keep in mind the following requirements when setting up or creating anomaly detection visualizations. 
The visualization: diff --git a/_observing-your-data/ad/index.md b/_observing-your-data/ad/index.md index 5dfa1b8f1a..657c3c90cb 100644 --- a/_observing-your-data/ad/index.md +++ b/_observing-your-data/ad/index.md @@ -10,30 +10,42 @@ redirect_from: # Anomaly detection -An anomaly in OpenSearch is any unusual behavior change in your time-series data. Anomalies can provide valuable insights into your data. For example, for IT infrastructure data, an anomaly in the memory usage metric might help you uncover early signs of a system failure. +An _anomaly_ in OpenSearch is any unusual behavior change in your time-series data. Anomalies can provide valuable insights into your data. For example, for IT infrastructure data, an anomaly in the memory usage metric can help identify early signs of a system failure. -It can be challenging to discover anomalies using conventional methods such as creating visualizations and dashboards. You could configure an alert based on a static threshold, but this requires prior domain knowledge and isn't adaptive to data that exhibits organic growth or seasonal behavior. +Conventional techniques like visualizations and dashboards can make it difficult to uncover anomalies. Configuring alerts based on static thresholds is possible, but this approach requires prior domain knowledge and may not adapt to data with organic growth or seasonal trends. -Anomaly detection automatically detects anomalies in your OpenSearch data in near real-time using the Random Cut Forest (RCF) algorithm. RCF is an unsupervised machine learning algorithm that models a sketch of your incoming data stream to compute an `anomaly grade` and `confidence score` value for each incoming data point. These values are used to differentiate an anomaly from normal variations. 
For more information about how RCF works, see [Random Cut Forests](https://www.semanticscholar.org/paper/Robust-Random-Cut-Forest-Based-Anomaly-Detection-on-Guha-Mishra/ecb365ef9b67cd5540cc4c53035a6a7bd88678f9). +Anomaly detection automatically detects anomalies in your OpenSearch data in near real time using the Random Cut Forest (RCF) algorithm. RCF is an unsupervised machine learning algorithm that models a sketch of your incoming data stream to compute an _anomaly grade_ and _confidence score_ value for each incoming data point. These values are used to differentiate an anomaly from normal variations. For more information about how RCF works, see [Robust Random Cut Forest Based Anomaly Detection on Streams](https://www.semanticscholar.org/paper/Robust-Random-Cut-Forest-Based-Anomaly-Detection-on-Guha-Mishra/ecb365ef9b67cd5540cc4c53035a6a7bd88678f9). You can pair the Anomaly Detection plugin with the [Alerting plugin]({{site.url}}{{site.baseurl}}/monitoring-plugins/alerting/) to notify you as soon as an anomaly is detected. +{: .note} -To get started, choose **Anomaly Detection** in OpenSearch Dashboards. -To first test with sample streaming data, you can try out one of the preconfigured detectors with one of the sample datasets. +## Getting started with anomaly detection in OpenSearch Dashboards + +To get started, go to **OpenSearch Dashboards** > **OpenSearch Plugins** > **Anomaly Detection**. ## Step 1: Define a detector -A detector is an individual anomaly detection task. You can define multiple detectors, and all the detectors can run simultaneously, with each analyzing data from different sources. +A _detector_ is an individual anomaly detection task. You can define multiple detectors, and all detectors can run simultaneously, with each analyzing data from different sources. You can define a detector by following these steps: + +1. On the **Anomaly detection** page, select the **Create detector** button. +2. 
On the **Define detector** page, enter the required information in the **Detector details** pane. +3. In the **Select data** pane, specify the data source by choosing a source from the **Index** dropdown menu. You can choose an index, index patterns, or an alias. +4. (Optional) Filter the data source by selecting **Add data filter** and then entering the conditions for **Field**, **Operator**, and **Value**. Alternatively, you can choose **Use query DSL** and add your JSON filter query. Only [Boolean queries]({{site.url}}{{site.baseurl}}/query-dsl/compound/bool/) are supported for query domain-specific language (DSL). +#### Example: Filtering data using query DSL + +The following example query retrieves documents in which the `urlPath.keyword` field matches any of the specified values: +======= 1. Choose **Create detector**. 1. Add in the detector details. - Enter a name and brief description. Make sure the name is unique and descriptive enough to help you to identify the purpose of the detector. 1. Specify the data source. - - For **Data source**, choose the index you want to use as the data source. You can optionally use index patterns to choose multiple indexes. + - For **Data source**, choose one or more indexes to use as the data source. Alternatively, you can use an alias or index pattern to choose multiple indexes. + - Detectors can use remote indexes. You can access them using the `cluster-name:index-name` pattern. See [Cross-cluster search]({{site.url}}{{site.baseurl}}/search-plugins/cross-cluster-search/) for more information. Alternatively, you can select clusters and indexes in OpenSearch Dashboards 2.17 or later. To learn about configuring remote indexes with the Security plugin enabled, see [Selecting remote indexes with fine-grained access control]({{site.url}}{{site.baseurl}}/observing-your-data/ad/security/#selecting-remote-indexes-with-fine-grained-access-control) in the [Anomaly detection security]({{site.url}}{{site.baseurl}}/observing-your-data/ad/security/) documentation. 
- (Optional) For **Data filter**, filter the index you chose as the data source. From the **Data filter** menu, choose **Add data filter**, and then design your filter query by selecting **Field**, **Operator**, and **Value**, or choose **Use query DSL** and add your own JSON filter query. Only [Boolean queries]({{site.url}}{{site.baseurl}}/query-dsl/compound/bool/) are supported for query domain-specific language (DSL). -#### Example filter using query DSL -The query is designed to retrieve documents in which the `urlPath.keyword` field matches one of the following specified values: +To create a cross-cluster detector in OpenSearch Dashboards, the following [permissions]({{site.url}}{{site.baseurl}}/security/access-control/permissions/) are required: `indices:data/read/field_caps`, `indices:admin/resolve/index`, and `cluster:monitor/remote/info`. +{: .note} - /domain/{id}/short - /sub_dir/{id}/short @@ -62,40 +74,38 @@ The query is designed to retrieve documents in which the `urlPath.keyword` field } } ``` + {% include copy-curl.html %} -1. Specify a timestamp. - - Select the **Timestamp field** in your index. -1. Define operation settings. - - For **Operation settings**, define the **Detector interval**, which is the time interval at which the detector collects data. - - The detector aggregates the data in this interval, then feeds the aggregated result into the anomaly detection model. - The shorter you set this interval, the fewer data points the detector aggregates. - The anomaly detection model uses a shingling process, a technique that uses consecutive data points to create a sample for the model. This process needs a certain number of aggregated data points from contiguous intervals. - - - We recommend setting the detector interval based on your actual data. If it's too long it might delay the results, and if it's too short it might miss some data. It also won't have a sufficient number of consecutive data points for the shingle process. +5. 
In the **Timestamp** pane, select a field from the **Timestamp field** dropdown menu. - - (Optional) To add extra processing time for data collection, specify a **Window delay** value. +6. In the **Operation settings** pane, define the **Detector interval**, which is the interval at which the detector collects data. + - The detector aggregates the data at this interval and then feeds the aggregated result into the anomaly detection model. The shorter the interval, the fewer data points the detector aggregates. The anomaly detection model uses a shingling process, a technique that uses consecutive data points to create a sample for the model. This process requires a certain number of aggregated data points from contiguous intervals. + - You should set the detector interval based on your actual data. If the detector interval is too long, then it might delay the results. If the detector interval is too short, then it might miss some data. The detector interval also will not have a sufficient number of consecutive data points for the shingle process. + - (Optional) To add extra processing time for data collection, specify a **Window delay** value. - This value tells the detector that the data is not ingested into OpenSearch in real time but with a certain delay. Set the window delay to shift the detector interval to account for this delay. - - For example, say the detector interval is 10 minutes and data is ingested into your cluster with a general delay of 1 minute. Assume the detector runs at 2:00. The detector attempts to get the last 10 minutes of data from 1:50 to 2:00, but because of the 1-minute delay, it only gets 9 minutes of data and misses the data from 1:59 to 2:00. Setting the window delay to 1 minute shifts the interval window to 1:49--1:59, so the detector accounts for all 10 minutes of the detector interval time. -1. Specify custom results index. 
- - The Anomaly Detection plugin allows you to store anomaly detection results in a custom index of your choice. To enable this, select **Enable custom results index** and provide a name for your index, for example, `abc`. The plugin then creates an alias prefixed with `opensearch-ad-plugin-result-` followed by your chosen name, for example, `opensearch-ad-plugin-result-abc`. This alias points to an actual index with a name containing the date and a sequence number, like `opensearch-ad-plugin-result-abc-history-2024.06.12-000002`, where your results are stored. + - For example, the detector interval is 10 minutes and data is ingested into your cluster with a general delay of 1 minute. Assume the detector runs at 2:00. The detector attempts to get the last 10 minutes of data from 1:50 to 2:00, but because of the 1-minute delay, it only gets 9 minutes of data and misses the data from 1:59 to 2:00. Setting the window delay to 1 minute shifts the interval window to 1:49--1:59, so the detector accounts for all 10 minutes of the detector interval time. + - To avoid missing any data, set the **Window delay** to the upper limit of the expected ingestion delay. This ensures that the detector captures all data during its interval, reducing the risk of missing relevant information. While a longer window delay helps capture all data, too long of a window delay can hinder real-time anomaly detection because the detector will look further back in time. Find a balance to maintain both data accuracy and timely detection. - You can use the dash “-” sign to separate the namespace to manage custom results index permissions. For example, if you use `opensearch-ad-plugin-result-financial-us-group1` as the results index, you can create a permission role based on the pattern `opensearch-ad-plugin-result-financial-us-*` to represent the "financial" department at a granular level for the "us" area. +7. Specify a custom results index. 
+ - The Anomaly Detection plugin allows you to store anomaly detection results in a custom index of your choice. Select **Enable custom results index** and provide a name for your index, for example, `abc`. The plugin then creates an alias prefixed with `opensearch-ad-plugin-result-` followed by your chosen name, for example, `opensearch-ad-plugin-result-abc`. This alias points to an actual index with a name containing the date and a sequence number, such as `opensearch-ad-plugin-result-abc-history-2024.06.12-000002`, where your results are stored. + + You can use `-` to separate the namespace to manage custom results index permissions. For example, if you use `opensearch-ad-plugin-result-financial-us-group1` as the results index, you can create a permission role based on the pattern `opensearch-ad-plugin-result-financial-us-*` to represent the `financial` department at a granular level for the `us` group. {: .note } - When the Security plugin (fine-grained access control) is enabled, the default results index becomes a system index and is no longer accessible through the standard Index or Search APIs. To access its content, you must use the Anomaly Detection RESTful API or the dashboard. As a result, you cannot build customized dashboards using the default results index if the Security plugin is enabled. However, you can create a custom results index in order to build customized dashboards. - If the custom index you specify does not exist, the Anomaly Detection plugin will create it when you create the detector and start your real-time or historical analysis. - If the custom index already exists, the plugin will verify that the index mapping matches the required structure for anomaly results. In this case, ensure that the custom index has a valid mapping as defined in the [`anomaly-results.json`](https://github.com/opensearch-project/anomaly-detection/blob/main/src/main/resources/mappings/anomaly-results.json) file. 
- - To use the custom results index option, you need the following permissions: - - `indices:admin/create` - The Anomaly Detection plugin requires the ability to create and roll over the custom index. - - `indices:admin/aliases` - The Anomaly Detection plugin requires access to create and manage an alias for the custom index. - - `indices:data/write/index` - You need the `write` permission for the Anomaly Detection plugin to write results into the custom index for a single-entity detector. - - `indices:data/read/search` - You need the `search` permission because the Anomaly Detection plugin needs to search custom results indexes to show results on the Anomaly Detection UI. - - `indices:data/write/delete` - Because the detector might generate a large number of anomaly results, you need the `delete` permission to delete old data and save disk space. - - `indices:data/write/bulk*` - You need the `bulk*` permission because the Anomaly Detection plugin uses the bulk API to write results into the custom index. - - Managing the custom results index: - - The anomaly detection dashboard queries all detectors’ results from all custom results indexes. Having too many custom results indexes might impact the performance of the Anomaly Detection plugin. - - You can use [Index State Management]({{site.url}}{{site.baseurl}}/im-plugin/ism/index/) to rollover old results indexes. You can also manually delete or archive any old results indexes. We recommend reusing a custom results index for multiple detectors. - - The Anomaly Detection plugin also provides lifecycle management for custom indexes. It rolls an alias over to a new index when the custom results index meets any of the conditions in the following table. + - To use the custom results index option, you must have the following permissions: + - `indices:admin/create` -- The `create` permission is required in order to create and roll over the custom index. 
+ - `indices:admin/aliases` -- The `aliases` permission is required in order to create and manage an alias for the custom index. + - `indices:data/write/index` -- The `write` permission is required in order to write results into the custom index for a single-entity detector. + - `indices:data/read/search` -- The `search` permission is required in order to search custom results indexes to show results on the Anomaly Detection interface. + - `indices:data/write/delete` -- The detector may generate many anomaly results. The `delete` permission is required in order to delete old data and save disk space. + - `indices:data/write/bulk*` -- The `bulk*` permission is required because the plugin uses the Bulk API to write results into the custom index. + - When managing the custom results index, consider the following: + - The anomaly detection dashboard queries all detector results from all custom results indexes. Having too many custom results indexes can impact the plugin's performance. + - You can use [Index State Management]({{site.url}}{{site.baseurl}}/im-plugin/ism/index/) to roll over old results indexes. You can also manually delete or archive any old results indexes. Reusing a custom results index for multiple detectors is recommended. + - The plugin provides lifecycle management for custom indexes. It rolls over an alias to a new index when the custom results index meets any of the conditions in the following table. Parameter | Description | Type | Unit | Example | Required :--- | :--- |:--- |:--- |:--- |:--- @@ -103,43 +113,52 @@ The query is designed to retrieve documents in which the `urlPath.keyword` field `result_index_min_age` | The minimum index age required for rollover, calculated from its creation time to the current time. | `integer` |`day` | `7` | No `result_index_ttl` | The minimum age required to permanently delete rolled-over indexes. | `integer` | `day` | `60` | No -1. Choose **Next**. +8. Choose **Next**. 
After you define the detector, the next step is to configure the model. ## Step 2: Configure the model -#### Add features to your detector +1. Add features to your detector. -A feature is the field in your index that you want to check for anomalies. A detector can discover anomalies across one or more features. You must choose an aggregation method for each feature: `average()`, `count()`, `sum()`, `min()`, or `max()`. The aggregation method determines what constitutes an anomaly. +A _feature_ is any field in your index that you want to analyze for anomalies. A detector can discover anomalies across one or more features. You must choose an aggregation method for each feature: `average()`, `count()`, `sum()`, `min()`, or `max()`. The aggregation method determines what constitutes an anomaly. For example, if you choose `min()`, the detector focuses on finding anomalies based on the minimum values of your feature. If you choose `average()`, the detector finds anomalies based on the average values of your feature. -A multi-feature model correlates anomalies across all its features. The [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality) makes it less likely for multi-feature models to identify smaller anomalies as compared to a single-feature model. Adding more features might negatively impact the [precision and recall](https://en.wikipedia.org/wiki/Precision_and_recall) of a model. A higher proportion of noise in your data might further amplify this negative impact. Selecting the optimal feature set is usually an iterative process. By default, the maximum number of features for a detector is 5. You can adjust this limit with the `plugins.anomaly_detection.max_anomaly_features` setting. -{: .note } +A multi-feature model correlates anomalies across all its features. 
The [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality) makes it less likely that multi-feature models will identify smaller anomalies as compared to a single-feature model. Adding more features can negatively impact the [precision and recall](https://en.wikipedia.org/wiki/Precision_and_recall) of a model. A higher proportion of noise in your data can further amplify this negative impact. Selecting the optimal feature set is usually an iterative process. By default, the maximum number of features for a detector is `5`. You can adjust this limit using the `plugins.anomaly_detection.max_anomaly_features` setting. +{: .note} + +### Configuring a model based on an aggregation method To configure an anomaly detection model based on an aggregation method, follow these steps: -1. On the **Configure Model** page, enter the **Feature name** and check **Enable feature**. -1. For **Find anomalies based on**, select **Field Value**. -1. For **aggregation method**, select either **average()**, **count()**, **sum()**, **min()**, or **max()**. -1. For **Field**, select from the available options. +1. On the **Detectors** page, select the desired detector from the list. +2. On the detector's details page, select the **Actions** button to activate the dropdown menu and then select **Edit model configuration**. +3. On the **Edit model configuration** page, select the **Add another feature** button. +4. Enter a name in the **Feature name** field and select the **Enable feature** checkbox. +5. Select **Field value** from the dropdown menu under **Find anomalies based on**. +6. Select the desired aggregation from the dropdown menu under **Aggregation method**. +7. Select the desired field from the options listed in the dropdown menu under **Field**. +8. Select the **Save changes** button. + +### Configuring a model based on a JSON aggregation query To configure an anomaly detection model based on a JSON aggregation query, follow these steps: -1. 
On the **Configure Model** page, enter the **Feature name** and check **Enable feature**. -1. For **Find anomalies based on**, select **Custom expression**. You will see the JSON editor window open up. -1. Enter your JSON aggregation query in the editor. -For acceptable JSON query syntax, see [OpenSearch Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index/) -{: .note } +1. On the **Edit model configuration** page, select the **Add another feature** button. +2. Enter a name in the **Feature name** field and select the **Enable feature** checkbox. +3. Select **Custom expression** from the dropdown menu under **Find anomalies based on**. The JSON editor window will open. +4. Enter your JSON aggregation query in the editor. +5. Select the **Save changes** button. -#### (Optional) Set category fields for high cardinality +For acceptable JSON query syntax, see [OpenSearch Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index/). +{: .note} -You can categorize anomalies based on a keyword or IP field type. +### Setting categorical fields for high cardinality -The category field categorizes or slices the source time series with a dimension like IP addresses, product IDs, country codes, and so on. This helps to see a granular view of anomalies within each entity of the category field to isolate and debug issues. +You can categorize anomalies based on a keyword or IP field type. You can enable the **Categorical fields** option to categorize, or "slice," the source time series using a dimension, such as an IP address, a product ID, or a country code. This gives you a granular view of anomalies within each entity of the category field to help isolate and debug issues. -To set a category field, choose **Enable a category field** and select a field. You can’t change the category fields after you create the detector. +To set a category field, choose **Enable categorical fields** and select a field. 
You cannot change the category fields after you create the detector. Only a certain number of unique entities are supported in the category field. Use the following equation to calculate the recommended total number of entities supported in a cluster: @@ -147,7 +166,7 @@ Only a certain number of unique entities are supported in the category field. Us (data nodes * heap size * anomaly detection maximum memory percentage) / (entity model size of a detector) ``` -To get the entity model size of a detector, use the [profile detector API]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/api/#profile-detector). You can adjust the maximum memory percentage with the `plugins.anomaly_detection.model_max_size_percent` setting. +To get the detector's entity model size, use the [Profile Detector API]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/api/#profile-detector). You can adjust the maximum memory percentage using the `plugins.anomaly_detection.model_max_size_percent` setting. Consider a cluster with 3 data nodes, each with 8 GB of JVM heap size and the default 10% memory allocation. With an entity model size of 1 MB, the following formula calculates the estimated number of unique entities: @@ -155,81 +174,109 @@ Consider a cluster with 3 data nodes, each with 8 GB of JVM heap size and the de (8096 MB * 0.1 / 1 MB ) * 3 = 2429 ``` -If the actual total number of unique entities is higher than the number that you calculate (in this case, 2,429), the anomaly detector will attempt to model the extra entities. The detector prioritizes entities that occur more often and are more recent. +If the actual total number of unique entities is higher than the number that you calculate (in this case, 2,429), then the anomaly detector attempts to model the extra entities. The detector prioritizes both entities that occur more often and are more recent. -This formula serves as a starting point. Make sure to test it with a representative workload. 
You can find more information in the [Improving Anomaly Detection: One million entities in one minute](https://opensearch.org/blog/one-million-enitities-in-one-minute/) blog post. +This formula serves as a starting point. Make sure to test it with a representative workload. See the OpenSearch blog post [Improving Anomaly Detection: One million entities in one minute](https://opensearch.org/blog/one-million-enitities-in-one-minute/) for more information. {: .note } -#### (Advanced settings) Set a shingle size +### Setting a shingle size -Set the number of aggregation intervals from your data stream to consider in a detection window. It’s best to choose this value based on your actual data to see which one leads to the best results for your use case. +In the **Advanced settings** pane, you can set the number of data stream aggregation intervals to include in the detection window. Choose this value based on your actual data to find the optimal setting for your use case. To set the shingle size, select **Show** in the **Advanced settings** pane. Enter the desired size in the **intervals** field. -The anomaly detector expects the shingle size to be in the range of 1 and 60. The default shingle size is 8. We recommend that you don't choose 1 unless you have two or more features. Smaller values might increase [recall](https://en.wikipedia.org/wiki/Precision_and_recall) but also false positives. Larger values might be useful for ignoring noise in a signal. +The anomaly detector requires the shingle size to be between 1 and 128. The default is `8`. Use `1` only if you have at least two features. Values of less than `8` may increase [recall](https://en.wikipedia.org/wiki/Precision_and_recall) but also may increase false positives. Values greater than `8` may be useful for ignoring noise in a signal. -#### Preview sample anomalies +### Setting an imputation option -Preview sample anomalies and adjust the feature settings if needed. 
-For sample previews, the Anomaly Detection plugin selects a small number of data samples---for example, one data point every 30 minutes---and uses interpolation to estimate the remaining data points to approximate the actual feature data. It loads this sample dataset into the detector. The detector uses this sample dataset to generate a sample preview of anomaly results. +In the **Advanced settings** pane, you can set the imputation option. This allows you to manage missing data in your streams. The options include the following: -Examine the sample preview and use it to fine-tune your feature configurations (for example, enable or disable features) to get more accurate results. +- **Ignore Missing Data (Default):** The system continues without considering missing data points, keeping the existing data flow. +- **Fill with Custom Values:** Specify a custom value for each feature to replace missing data points, allowing for targeted imputation tailored to your data. +- **Fill with Zeros:** Replace missing values with zeros. This is ideal when the absence of data indicates a significant event, such as a drop to zero in event counts. +- **Use Previous Values:** Fill gaps with the last observed value to maintain continuity in your time-series data. This method treats missing data as non-anomalous, carrying forward the previous trend. -1. Choose **Preview sample anomalies**. - - If you don't see any sample anomaly result, check the detector interval and make sure you have more than 400 data points for some entities during the preview date range. -1. Choose **Next**. +Using these options can improve recall in anomaly detection. For instance, if you are monitoring for drops in event counts, including both partial and complete drops, then filling missing values with zeros helps detect significant data absences, improving detection recall. + +Be cautious when imputing extensively missing data, as excessive gaps can compromise model accuracy. 
Quality input is critical---poor data quality leads to poor model performance. The confidence score also decreases when imputations occur. You can check whether a feature value has been imputed using the `feature_imputed` field in the anomaly results index. See [Anomaly result mapping]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/result-mapping/) for more information. +{: .note} + +### Suppressing anomalies with threshold-based rules + +In the **Advanced settings** pane, you can suppress anomalies by setting rules that define acceptable differences between the expected and actual values, either as an absolute value or a relative percentage. This helps reduce false anomalies caused by minor fluctuations, allowing you to focus on significant deviations. + +Suppose you want to detect substantial changes in log volume while ignoring small variations that are not meaningful. Without customized settings, the system might generate false alerts for minor changes, making it difficult to identify true anomalies. By setting suppression rules, you can ignore minor deviations and focus on real anomalous patterns. + +To suppress anomalies for deviations of less than 30% from the expected value, you can set the following rules: -## Step 3: Set up detector jobs +``` +Ignore anomalies for feature logVolume when the actual value is no more than 30% above the expected value. +Ignore anomalies for feature logVolume when the actual value is no more than 30% below the expected value. +``` + +Ensure that a feature, for example, `logVolume`, is properly defined in your model. Suppression rules are tied to specific features. +{: .note} + +If you expect that the log volume should differ by at least 10,000 from the expected value before being considered an anomaly, you can set absolute thresholds: -To start a real-time detector to find anomalies in your data in near real-time, check **Start real-time detector automatically (recommended)**. 
+``` +Ignore anomalies for feature logVolume when the actual value is no more than 10000 above the expected value. +Ignore anomalies for feature logVolume when the actual value is no more than 10000 below the expected value. +``` -Alternatively, if you want to perform historical analysis and find patterns in long historical data windows (weeks or months), check **Run historical analysis detection** and select a date range (at least 128 detection intervals). +If no custom suppression rules are set, then the system defaults to a filter that ignores anomalies with deviations of less than 20% from the expected value for each enabled feature. -Analyzing historical data helps you get familiar with the Anomaly Detection plugin. You can also evaluate the performance of a detector with historical data to further fine-tune it. +### Previewing sample anomalies -We recommend experimenting with historical analysis with different feature sets and checking the precision before moving on to real-time detectors. +You can preview anomalies based on sample feature input and adjust the feature settings as needed. The Anomaly Detection plugin selects a small number of data samples---for example, 1 data point every 30 minutes---and uses interpolation to estimate the remaining data points to approximate the actual feature data. The sample dataset is loaded into the detector, which then uses the sample dataset to generate a preview of the anomalies. + +1. Choose **Preview sample anomalies**. + - If sample anomaly results are not displayed, check the detector interval to verify that 400 or more data points are set for the entities during the preview date range. +2. Select the **Next** button. -## Step 4: Review and create +## Step 3: Setting up detector jobs -Review your detector settings and model configurations to make sure that they're valid and then select **Create detector**. 
+To start a detector to find anomalies in your data in near real time, select **Start real-time detector automatically (recommended)**. -![Anomaly detection results]({{site.url}}{{site.baseurl}}/images/review_ad.png) +Alternatively, if you want to perform historical analysis and find patterns in longer historical data windows (weeks or months), select the **Run historical analysis detection** box and select a date range of at least 128 detection intervals. -If you see any validation errors, edit the settings to fix the errors and then return back to this page. +Analyzing historical data can help to familiarize you with the Anomaly Detection plugin. For example, you can evaluate the performance of a detector against historical data in order to fine-tune it. + +You can experiment with historical analysis by using different feature sets and checking the precision before using real-time detectors. + +## Step 4: Reviewing detector settings + +Review your detector settings and model configurations to confirm that they are valid and then select **Create detector**. + +If a validation error occurs, edit the settings to correct the error and return to the detector page. {: .note } -## Step 5: Observe the results +## Step 5: Observing the results -Choose the **Real-time results** or **Historical analysis** tab. For real-time results, you need to wait for some time to see the anomaly results. If the detector interval is 10 minutes, the detector might take more than an hour to start, because its waiting for sufficient data to generate anomalies. +Choose either the **Real-time results** or **Historical analysis** tab. For real-time results, it will take some time to display the anomaly results. For example, if the detector interval is 10 minutes, then the detector may take an hour to initiate because it is waiting for sufficient data to be able to generate anomalies. 
-A shorter interval means the model passes the shingle process more quickly and starts to generate the anomaly results sooner. -Use the [profile detector]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/api#profile-detector) operation to make sure you have sufficient data points. +A shorter interval results in the model passing the shingle process more quickly and generating anomaly results sooner. You can use the [profile detector]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/api#profile-detector) operation to ensure that you have enough data points. -If you see the detector pending in "initialization" for longer than a day, aggregate your existing data using the detector interval to check for any missing data points. If you find a lot of missing data points from the aggregated data, consider increasing the detector interval. +If the detector is pending in "initialization" for longer than 1 day, aggregate your existing data and use the detector interval to check for any missing data points. If you find many missing data points, consider increasing the detector interval. -Choose and drag over the anomaly line chart to zoom in and see a more detailed view of an anomaly. +Click and drag over the anomaly line chart to zoom in and see a detailed view of an anomaly. {: .note } -Analyze anomalies with the following visualizations: +You can analyze anomalies using the following visualizations: -- **Live anomalies** (for real-time results) displays live anomaly results for the last 60 intervals. For example, if the interval is 10, it shows results for the last 600 minutes. The chart refreshes every 30 seconds. -- **Anomaly overview** (for real-time results) / **Anomaly history** (for historical analysis in the **Historical analysis** tab) plots the anomaly grade with the corresponding measure of confidence. This pane includes: +- **Live anomalies** (for real-time results) displays live anomaly results for the last 60 intervals. 
For example, if the interval is `10`, it shows results for the last 600 minutes. The chart refreshes every 30 seconds. +- **Anomaly overview** (for real-time results) or **Anomaly history** (for historical analysis on the **Historical analysis** tab) plot the anomaly grade with the corresponding measure of confidence. The pane includes: - The number of anomaly occurrences based on the given data-time range. - - The **Average anomaly grade**, a number between 0 and 1 that indicates how anomalous a data point is. An anomaly grade of 0 represents “not an anomaly,” and a non-zero value represents the relative severity of the anomaly. + - The **Average anomaly grade**, a number between 0 and 1 that indicates how anomalous a data point is. An anomaly grade of `0` represents "not an anomaly," and a non-zero value represents the relative severity of the anomaly. - **Confidence** estimate of the probability that the reported anomaly grade matches the expected anomaly grade. Confidence increases as the model observes more data and learns the data behavior and trends. Note that confidence is distinct from model accuracy. - **Last anomaly occurrence** is the time at which the last anomaly occurred. -Underneath **Anomaly overview**/**Anomaly history** are: +Underneath **Anomaly overview** or **Anomaly history** are: - **Feature breakdown** plots the features based on the aggregation method. You can vary the date-time range of the detector. Selecting a point on the feature line chart shows the **Feature output**, the number of times a field appears in your index, and the **Expected value**, a predicted value for the feature output. Where there is no anomaly, the output and expected values are equal. - ![Anomaly detection results]({{site.url}}{{site.baseurl}}/images/feature-contribution-ad.png) - - **Anomaly occurrences** shows the `Start time`, `End time`, `Data confidence`, and `Anomaly grade` for each detected anomaly. 
Selecting a point on the anomaly line chart shows **Feature Contribution**, the percentage of a feature that contributes to the anomaly -![Anomaly detection results]({{site.url}}{{site.baseurl}}/images/feature-contribution-ad.png) - - If you set the category field, you see an additional **Heat map** chart. The heat map correlates results for anomalous entities. This chart is empty until you select an anomalous entity. You also see the anomaly and feature line chart for the time period of the anomaly (`anomaly_grade` > 0). @@ -249,7 +296,7 @@ To see all the configuration settings for a detector, choose the **Detector conf 1. To make any changes to the detector configuration, or fine tune the time interval to minimize any false positives, go to the **Detector configuration** section and choose **Edit**. - You need to stop real-time and historical analysis to change its configuration. Confirm that you want to stop the detector and proceed. -1. To enable or disable features, in the **Features** section, choose **Edit** and adjust the feature settings as needed. After you make your changes, choose **Save and start detector**. +2. To enable or disable features, in the **Features** section, choose **Edit** and adjust the feature settings as needed. After you make your changes, choose **Save and start detector**. ## Step 8: Manage your detectors diff --git a/_observing-your-data/ad/result-mapping.md b/_observing-your-data/ad/result-mapping.md index 7e1482a013..967b185684 100644 --- a/_observing-your-data/ad/result-mapping.md +++ b/_observing-your-data/ad/result-mapping.md @@ -9,9 +9,7 @@ redirect_from: # Anomaly result mapping -If you enabled custom result index, the anomaly detection plugin stores the results in your own index. 
- -If the anomaly detector doesn’t detect an anomaly, the result has the following format: +When you select the **Enable custom result index** box on the **Custom result index** pane, the Anomaly Detection plugin will save the results to an index of your choosing. When the anomaly detector does not detect an anomaly, the result format is as follows: ```json { @@ -61,6 +59,7 @@ If the anomaly detector doesn’t detect an anomaly, the result has the followin "threshold": 1.2368549346675202 } ``` +{% include copy-curl.html %} ## Response body fields @@ -80,7 +79,83 @@ Field | Description `model_id` | A unique ID that identifies a model. If a detector is a single-stream detector (with no category field), it has only one model. If a detector is a high-cardinality detector (with one or more category fields), it might have multiple models, one for each entity. `threshold` | One of the criteria for a detector to classify a data point as an anomaly is that its `anomaly_score` must surpass a dynamic threshold. This field records the current threshold. -If an anomaly detector detects an anomaly, the result has the following format: +When the imputation option is enabled, the anomaly results include a `feature_imputed` array showing which features were modified due to missing data. If no features were imputed, then this is excluded. 
+ +In the following example anomaly result output, the `processing_bytes_max` feature was imputed, as shown by the `imputed: true` status: + +```json +{ + "detector_id": "kzcZ43wBgEQAbjDnhzGF", + "schema_version": 5, + "data_start_time": 1635898161367, + "data_end_time": 1635898221367, + "feature_data": [ + { + "feature_id": "processing_bytes_max", + "feature_name": "processing bytes max", + "data": 2322 + }, + { + "feature_id": "processing_bytes_avg", + "feature_name": "processing bytes avg", + "data": 1718.6666666666667 + }, + { + "feature_id": "processing_bytes_min", + "feature_name": "processing bytes min", + "data": 1375 + }, + { + "feature_id": "processing_bytes_sum", + "feature_name": "processing bytes sum", + "data": 5156 + }, + { + "feature_id": "processing_time_max", + "feature_name": "processing time max", + "data": 31198 + } + ], + "execution_start_time": 1635898231577, + "execution_end_time": 1635898231622, + "anomaly_score": 1.8124904404395776, + "anomaly_grade": 0, + "confidence": 0.9802940756605277, + "entity": [ + { + "name": "process_name", + "value": "process_3" + } + ], + "model_id": "kzcZ43wBgEQAbjDnhzGF_entity_process_3", + "threshold": 1.2368549346675202, + "feature_imputed": [ + { + "feature_id": "processing_bytes_max", + "imputed": true + }, + { + "feature_id": "processing_bytes_avg", + "imputed": false + }, + { + "feature_id": "processing_bytes_min", + "imputed": false + }, + { + "feature_id": "processing_bytes_sum", + "imputed": false + }, + { + "feature_id": "processing_time_max", + "imputed": false + } + ] +} +``` +{% include copy-curl.html %} + +When an anomaly is detected, the result is provided in the following format: ```json { @@ -179,24 +254,23 @@ If an anomaly detector detects an anomaly, the result has the following format: "execution_start_time": 1635898427803 } ``` +{% include copy-curl.html %} -You can see the following additional fields: +Note that the result includes the following additional field. 
Field | Description :--- | :--- `relevant_attribution` | Represents the contribution of each input variable. The sum of the attributions is normalized to 1. `expected_values` | The expected value for each feature. -At times, the detector might detect an anomaly late. -Let's say the detector sees a random mix of the triples {1, 2, 3} and {2, 4, 5} that correspond to `slow weeks` and `busy weeks`, respectively. For example 1, 2, 3, 1, 2, 3, 2, 4, 5, 1, 2, 3, 2, 4, 5, ... and so on. -If the detector comes across a pattern {2, 2, X} and it's yet to see X, the detector infers that the pattern is anomalous, but it can't determine at this point which of the 2's is the cause. If X = 3, then the detector knows it's the first 2 in that unfinished triple, and if X = 5, then it's the second 2. If it's the first 2, then the detector detects the anomaly late. +The detector may be late in detecting an anomaly. For example: The detector observes a sequence of data that alternates between "slow weeks" (represented by the triples {1, 2, 3}) and "busy weeks" (represented by the triples {2, 4, 5}). If the detector comes across a pattern {2, 2, X}, where it has not yet seen the value that X will take, then the detector infers that the pattern is anomalous. However, it cannot determine which 2 is the cause. If X = 3, then the first 2 is the anomaly. If X = 5, then the second 2 is the anomaly. If it is the first 2, then the detector will be late in detecting the anomaly. -If a detector detects an anomaly late, the result has the following additional fields: +When a detector is late in detecting an anomaly, the result includes the following additional fields. Field | Description :--- | :--- -`past_values` | The actual input that triggered an anomaly. If `past_values` is null, the attributions or expected values are from the current input. If `past_values` is not null, the attributions or expected values are from a past input (for example, the previous two steps of the data [1,2,3]). 
-`approx_anomaly_start_time` | The approximate time of the actual input that triggers an anomaly. This field helps you understand when a detector flags an anomaly. Both single-stream and high-cardinality detectors don't query previous anomaly results because these queries are expensive operations. The cost is especially high for high-cardinality detectors that might have a lot of entities. If the data is not continuous, the accuracy of this field is low and the actual time that the detector detects an anomaly can be earlier. +`past_values` | The actual input that triggered an anomaly. If `past_values` is `null`, then the attributions or expected values are from the current input. If `past_values` is not `null`, then the attributions or expected values are from a past input (for example, the previous two steps of the data [1,2,3]). +`approx_anomaly_start_time` | The approximate time of the actual input that triggered an anomaly. This field helps you understand the time at which a detector flags an anomaly. Both single-stream and high-cardinality detectors do not query previous anomaly results because these queries are costly operations. The cost is especially high for high-cardinality detectors that may have many entities. If the data is not continuous, then the accuracy of this field is low and the actual time at which the detector detects an anomaly can be earlier. ```json { @@ -319,3 +393,4 @@ Field | Description "approx_anomaly_start_time": 1635883620000 } ``` +{% include copy-curl.html %} diff --git a/_observing-your-data/ad/security.md b/_observing-your-data/ad/security.md index 8eeaa3df41..e4816cec46 100644 --- a/_observing-your-data/ad/security.md +++ b/_observing-your-data/ad/security.md @@ -23,6 +23,11 @@ As an admin user, you can use the Security plugin to assign specific permissions The Security plugin has two built-in roles that cover most anomaly detection use cases: `anomaly_full_access` and `anomaly_read_access`. 
For descriptions of each, see [Predefined roles]({{site.url}}{{site.baseurl}}/security/access-control/users-roles#predefined-roles). +If you use OpenSearch Dashboards to create your anomaly detectors, you may experience access issues even with `anomaly_full_access`. This issue has been resolved in OpenSearch 2.17, but for earlier versions, the following additional permissions need to be added: + +- `indices:data/read/search` -- You need this permission because the Anomaly Detection plugin needs to search the data source in order to validate whether there is enough data to train the model. +- `indices:admin/mappings/fields/get` and `indices:admin/mappings/fields/get*` -- You need these permissions to validate whether the given data source has a valid timestamp field and categorical field (in the case of creating a high-cardinality detector). + If these roles don't meet your needs, mix and match individual anomaly detection [permissions]({{site.url}}{{site.baseurl}}/security/access-control/permissions/) to suit your use case. Each action corresponds to an operation in the REST API. For example, the `cluster:admin/opensearch/ad/detector/delete` permission lets you delete detectors. ### A note on alerts and fine-grained access control @@ -31,6 +36,42 @@ When a trigger generates an alert, the detector and monitor configurations, the To reduce the chances of unintended users viewing metadata that could describe an index, we recommend that administrators enable role-based access control and keep these kinds of design elements in mind when assigning permissions to the intended group of users. See [Limit access by backend role](#advanced-limit-access-by-backend-role) for details. 
+### Selecting remote indexes with fine-grained access control + +To use a remote index as a data source for a detector, see the setup steps in [Authentication flow]({{site.url}}{{site.baseurl}}/search-plugins/cross-cluster-search/#authentication-flow) in [Cross-cluster search]({{site.url}}{{site.baseurl}}/search-plugins/cross-cluster-search/). You must use a role that exists in both the remote and local clusters. The remote cluster must map the chosen role to the same username as in the local cluster. + +--- + +#### Example: Create a new user on the local cluster + +1. Create a new user on the local cluster to use for detector creation: + +``` +curl -XPUT -k -u 'admin:
+ Response +
+ {: .text-delta} + +```json +{ + "top_queries": [ + { + "timestamp": 1725495127359, + "source": { + "query": { + "match_all": { + "boost": 1.0 + } + } + }, + "phase_latency_map": { + "expand": 0, + "query": 55, + "fetch": 3 + }, + "total_shards": 1, + "node_id": "ZbINz1KFS1OPeFmN-n5rdg", + "query_hashcode": "b4c4f69290df756021ca6276be5cbb75", + "task_resource_usages": [ + { + "action": "indices:data/read/search[phase/query]", + "taskId": 30, + "parentTaskId": 29, + "nodeId": "ZbINz1KFS1OPeFmN-n5rdg", + "taskResourceUsage": { + "cpu_time_in_nanos": 33249000, + "memory_in_bytes": 2896848 + } + }, + { + "action": "indices:data/read/search", + "taskId": 29, + "parentTaskId": -1, + "nodeId": "ZbINz1KFS1OPeFmN-n5rdg", + "taskResourceUsage": { + "cpu_time_in_nanos": 3151000, + "memory_in_bytes": 133936 + } + } + ], + "indices": [ + "my_index" + ], + "labels": {}, + "search_type": "query_then_fetch", + "measurements": { + "latency": { + "number": 160, + "count": 10, + "aggregationType": "AVERAGE" + } + } + }, + { + "timestamp": 1725495135160, + "source": { + "query": { + "term": { + "content": { + "value": "first", + "boost": 1.0 + } + } + } + }, + "phase_latency_map": { + "expand": 0, + "query": 18, + "fetch": 0 + }, + "total_shards": 1, + "node_id": "ZbINz1KFS1OPeFmN-n5rdg", + "query_hashcode": "c3620cc3d4df30fb3f95aeb2167289a4", + "task_resource_usages": [ + { + "action": "indices:data/read/search[phase/query]", + "taskId": 50, + "parentTaskId": 49, + "nodeId": "ZbINz1KFS1OPeFmN-n5rdg", + "taskResourceUsage": { + "cpu_time_in_nanos": 10188000, + "memory_in_bytes": 288136 + } + }, + { + "action": "indices:data/read/search", + "taskId": 49, + "parentTaskId": -1, + "nodeId": "ZbINz1KFS1OPeFmN-n5rdg", + "taskResourceUsage": { + "cpu_time_in_nanos": 262000, + "memory_in_bytes": 3216 + } + } + ], + "indices": [ + "my_index" + ], + "labels": {}, + "search_type": "query_then_fetch", + "measurements": { + "latency": { + "number": 109, + "count": 7, + "aggregationType": 
"AVERAGE" + } + } + }, + { + "timestamp": 1725495139766, + "source": { + "query": { + "match": { + "content": { + "query": "first", + "operator": "OR", + "prefix_length": 0, + "max_expansions": 50, + "fuzzy_transpositions": true, + "lenient": false, + "zero_terms_query": "NONE", + "auto_generate_synonyms_phrase_query": true, + "boost": 1.0 + } + } + } + }, + "phase_latency_map": { + "expand": 0, + "query": 15, + "fetch": 0 + }, + "total_shards": 1, + "node_id": "ZbINz1KFS1OPeFmN-n5rdg", + "query_hashcode": "484eaabecd13db65216b9e2ff5eee999", + "task_resource_usages": [ + { + "action": "indices:data/read/search[phase/query]", + "taskId": 64, + "parentTaskId": 63, + "nodeId": "ZbINz1KFS1OPeFmN-n5rdg", + "taskResourceUsage": { + "cpu_time_in_nanos": 12161000, + "memory_in_bytes": 473456 + } + }, + { + "action": "indices:data/read/search", + "taskId": 63, + "parentTaskId": -1, + "nodeId": "ZbINz1KFS1OPeFmN-n5rdg", + "taskResourceUsage": { + "cpu_time_in_nanos": 293000, + "memory_in_bytes": 3216 + } + } + ], + "indices": [ + "my_index" + ], + "labels": {}, + "search_type": "query_then_fetch", + "measurements": { + "latency": { + "number": 43, + "count": 3, + "aggregationType": "AVERAGE" + } + } + } + ] +} +``` + +- `none`: Ignores the relevance scores of child documents and assigns a score of `0` to the parent document.
- `avg`: Uses the average relevance score of all matching child documents.
- `max`: Assigns the highest relevance score from the matching child documents to the parent.
- `min`: Assigns the lowest relevance score from the matching child documents to the parent.
- `sum`: Sums the relevance scores of all matching child documents.
Default is `none`. | +| `inner_hits` | Optional | If provided, returns the underlying hits (child documents) that matched the query. | + + +## Sorting limitations + +The `has_child` query does not support [sorting results]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/sort/) using standard sorting options. If you need to sort parent documents by fields in their child documents, you can use a [`function_score` query]({{site.url}}{{site.baseurl}}/query-dsl/compound/function-score/) and sort by the parent document's score. + +In the preceding example, you can sort parent documents (brands) based on the `sales_count` of their child products. This query multiplies the score by the `sales_count` field of the child documents and assigns the highest relevance score from the matching child documents to the parent: + +```json +GET testindex1/_search +{ + "query": { + "has_child": { + "type": "product", + "query": { + "function_score": { + "script_score": { + "script": "_score * doc['sales_count'].value" + } + } + }, + "score_mode": "max" + } + } +} +``` +{% include copy-curl.html %} + +The response contains the brands sorted by the highest child `sales_count`: + +```json +{ + "took": 6, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 300, + "hits": [ + { + "_index": "testindex1", + "_id": "2", + "_score": 300, + "_source": { + "name": "Economy brand", + "product_to_brand": "brand" + } + }, + { + "_index": "testindex1", + "_id": "1", + "_score": 150, + "_source": { + "name": "Luxury brand", + "product_to_brand": "brand" + } + } + ] + } +} +``` + +## Next steps + +- Learn more about [retrieving inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/). 
\ No newline at end of file diff --git a/_query-dsl/joining/has-parent.md b/_query-dsl/joining/has-parent.md new file mode 100644 index 0000000000..6b293ffff2 --- /dev/null +++ b/_query-dsl/joining/has-parent.md @@ -0,0 +1,358 @@ +--- +layout: default +title: Has parent +parent: Joining queries +nav_order: 20 +--- + +# Has parent query + +The `has_parent` query returns child documents whose parent documents match a specific query. You can establish parent/child relationships between documents in the same index by using a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field type. + +The `has_parent` query is slower than other queries because of the join operation it performs. Performance decreases as the number of matching parent documents increases. Each `has_parent` query in your search may significantly impact query performance. If you prioritize speed, avoid using this query or limit its usage as much as possible. +{: .warning} + +## Example + +Before you can run a `has_parent` query, your index must contain a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field in order to establish parent/child relationships. The index mapping request uses the following format: + +```json +PUT /example_index +{ + "mappings": { + "properties": { + "relationship_field": { + "type": "join", + "relations": { + "parent_doc": "child_doc" + } + } + } + } +} +``` +{% include copy-curl.html %} + +For this example, first configure an index that contains documents representing products and their brands as described in the [`has_child` query example]({{site.url}}{{site.baseurl}}/query-dsl/joining/has-child/). + +To search for the child of a parent, use a `has_parent` query. 
The following query returns child documents (products) made by the brand matching the query `economy`: + +```json +GET testindex1/_search +{ + "query" : { + "has_parent": { + "parent_type":"brand", + "query": { + "match" : { + "name": "economy" + } + } + } + } +} +``` +{% include copy-curl.html %} + +The response returns all products made by the brand: + +```json +{ + "took": 11, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "testindex1", + "_id": "4", + "_score": 1, + "_routing": "2", + "_source": { + "name": "Electronic watch", + "sales_count": 300, + "product_to_brand": { + "name": "product", + "parent": "2" + } + } + }, + { + "_index": "testindex1", + "_id": "5", + "_score": 1, + "_routing": "2", + "_source": { + "name": "Digital watch", + "sales_count": 100, + "product_to_brand": { + "name": "product", + "parent": "2" + } + } + } + ] + } +} +``` + +## Retrieving inner hits + +To return parent documents that matched the query, provide the `inner_hits` parameter: + +```json +GET testindex1/_search +{ + "query" : { + "has_parent": { + "parent_type":"brand", + "query": { + "match" : { + "name": "economy" + } + }, + "inner_hits": {} + } + } +} +``` +{% include copy-curl.html %} + +The response contains parent documents in the `inner_hits` field: + +```json +{ + "took": 11, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "testindex1", + "_id": "4", + "_score": 1, + "_routing": "2", + "_source": { + "name": "Electronic watch", + "sales_count": 300, + "product_to_brand": { + "name": "product", + "parent": "2" + } + }, + "inner_hits": { + "brand": { + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.3862942, + 
"hits": [ + { + "_index": "testindex1", + "_id": "2", + "_score": 1.3862942, + "_source": { + "name": "Economy brand", + "product_to_brand": "brand" + } + } + ] + } + } + } + }, + { + "_index": "testindex1", + "_id": "5", + "_score": 1, + "_routing": "2", + "_source": { + "name": "Digital watch", + "sales_count": 100, + "product_to_brand": { + "name": "product", + "parent": "2" + } + }, + "inner_hits": { + "brand": { + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.3862942, + "hits": [ + { + "_index": "testindex1", + "_id": "2", + "_score": 1.3862942, + "_source": { + "name": "Economy brand", + "product_to_brand": "brand" + } + } + ] + } + } + } + } + ] + } +} +``` + +For more information about retrieving inner hits, see [Inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/). + +## Parameters + +The following table lists all top-level parameters supported by `has_parent` queries. + +| Parameter | Required/Optional | Description | +|:---|:---|:---| +| `parent_type` | Required | Specifies the name of the parent relationship as defined in the `join` field mapping. | +| `query` | Required | The query to run on parent documents. If a parent document matches the query, the child document is returned. | +| `ignore_unmapped` | Optional | Indicates whether to ignore unmapped `parent_type` fields and not return documents instead of throwing an error. You can provide this parameter when querying multiple indexes, some of which may not contain the `parent_type` field. Default is `false`. | +| `score` | Optional | Indicates whether the relevance score of a matching parent document is aggregated into its child documents. If `false`, then the relevance score of the parent document is ignored, and each child document is assigned a relevance score equal to the query's `boost`, which defaults to `1`. 
If `true`, then the relevance score of the matching parent document is aggregated into the relevance scores of its child documents. Default is `false`. | +| `inner_hits` | Optional | If provided, returns the underlying hits (parent documents) that matched the query. | + + +## Sorting limitations + +The `has_parent` query does not support [sorting results]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/sort/) using standard sorting options. If you need to sort child documents by fields in their parent documents, you can use a [`function_score` query]({{site.url}}{{site.baseurl}}/query-dsl/compound/function-score/) and sort by the child document's score. + +For the preceding example, first add a `customer_satisfaction` field by which you'll sort the child documents belonging to the parent (brand) documents: + +```json +PUT testindex1/_doc/1 +{ + "name": "Luxury watch brand", + "product_to_brand" : "brand", + "customer_satisfaction": 4.5 +} +``` +{% include copy-curl.html %} + +```json +PUT testindex1/_doc/2 +{ + "name": "Economy watch brand", + "product_to_brand" : "brand", + "customer_satisfaction": 3.9 +} +``` +{% include copy-curl.html %} + +Now you can sort child documents (products) based on the `customer_satisfaction` field of their parent brands. 
This query multiplies the score by the `customer_satisfaction` field of the parent documents: + +```json +GET testindex1/_search +{ + "query": { + "has_parent": { + "parent_type": "brand", + "score": true, + "query": { + "function_score": { + "script_score": { + "script": "_score * doc['customer_satisfaction'].value" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains the products, sorted by the highest parent `customer_satisfaction`: + +```json +{ + "took": 11, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": 4.5, + "hits": [ + { + "_index": "testindex1", + "_id": "3", + "_score": 4.5, + "_routing": "1", + "_source": { + "name": "Mechanical watch", + "sales_count": 150, + "product_to_brand": { + "name": "product", + "parent": "1" + } + } + }, + { + "_index": "testindex1", + "_id": "4", + "_score": 3.9, + "_routing": "2", + "_source": { + "name": "Electronic watch", + "sales_count": 300, + "product_to_brand": { + "name": "product", + "parent": "2" + } + } + }, + { + "_index": "testindex1", + "_id": "5", + "_score": 3.9, + "_routing": "2", + "_source": { + "name": "Digital watch", + "sales_count": 100, + "product_to_brand": { + "name": "product", + "parent": "2" + } + } + } + ] + } +} +``` + +## Next steps + +- Learn more about [retrieving inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/). \ No newline at end of file diff --git a/_query-dsl/joining/index.md b/_query-dsl/joining/index.md index 20f48c0b16..f0a0060640 100644 --- a/_query-dsl/joining/index.md +++ b/_query-dsl/joining/index.md @@ -3,16 +3,22 @@ layout: default title: Joining queries has_children: true nav_order: 55 +has_toc: false +redirect_from: + - /query-dsl/joining/ --- # Joining queries OpenSearch is a distributed system in which data is spread across multiple nodes. 
Thus, running a SQL-like JOIN operation in OpenSearch is resource intensive. As an alternative, OpenSearch provides the following queries that perform join operations and are optimized for scaling across multiple nodes: -- `nested` queries: Act as wrappers for other queries to search [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) fields. The nested field objects are searched as though they were indexed as separate documents. -- `has_child` queries: Search for parent documents whose child documents match the query. -- `has_parent` queries: Search for child documents whose parent documents match the query. -- `parent_id` queries: A [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) field type establishes a parent/child relationship between documents in the same index. `parent_id` queries search for child documents that are joined to a specific parent document. + +- Queries for searching [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) fields: + - `nested` queries: Act as wrappers for other queries to search [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) fields. The nested field objects are searched as though they were indexed as separate documents. +- Queries for searching documents connected by a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field type, which establishes a parent/child relationship between documents in the same index: + - [`has_child`]({{site.url}}{{site.baseurl}}/query-dsl/joining/has-child/) queries: Search for parent documents whose child documents match the query. + - [`has_parent`]({{site.url}}{{site.baseurl}}/query-dsl/joining/has-parent/) queries: Search for child documents whose parent documents match the query. + - [`parent_id`]({{site.url}}{{site.baseurl}}/query-dsl/joining/parent-id/) queries: Search for child documents that are joined to a specific parent document. 
If [`search.allow_expensive_queries`]({{site.url}}{{site.baseurl}}/query-dsl/index/#expensive-queries) is set to `false`, then joining queries are not executed. {: .important} \ No newline at end of file diff --git a/_query-dsl/joining/nested.md b/_query-dsl/joining/nested.md new file mode 100644 index 0000000000..431a40ed1a --- /dev/null +++ b/_query-dsl/joining/nested.md @@ -0,0 +1,347 @@ +--- +layout: default +title: Nested +parent: Joining queries +nav_order: 30 +--- + +# Nested query + +The `nested` query acts as a wrapper for other queries to search [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) fields. The nested field objects are searched as though they were indexed as separate documents. If an object matches the search, the `nested` query returns the parent document at the root level. + +## Example + +Before you can run a `nested` query, your index must contain a [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) field. 
+ +To configure an example index containing nested fields, send the following request: + +```json +PUT /testindex +{ + "mappings": { + "properties": { + "patient": { + "type": "nested", + "properties": { + "name": { + "type": "text" + }, + "age": { + "type": "integer" + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +Next, index a document into the example index: + +```json +PUT /testindex/_doc/1 +{ + "patient": { + "name": "John Doe", + "age": 56 + } +} +``` +{% include copy-curl.html %} + +To search the nested `patient` field, wrap your query in a `nested` query and provide the `path` to the nested field: + +```json +GET /testindex/_search +{ + "query": { + "nested": { + "path": "patient", + "query": { + "match": { + "patient.name": "John" + } + } + } + } +} +``` +{% include copy-curl.html %} + +The query returns the matching document: + +```json +{ + "took": 3, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.2876821, + "hits": [ + { + "_index": "testindex", + "_id": "1", + "_score": 0.2876821, + "_source": { + "patient": { + "name": "John Doe", + "age": 56 + } + } + } + ] + } +} +``` + +## Retrieving inner hits + +To return inner hits that matched the query, provide the `inner_hits` parameter: + +```json +GET /testindex/_search +{ + "query": { + "nested": { + "path": "patient", + "query": { + "match": { + "patient.name": "John" + } + }, + "inner_hits": {} + } + } +} +``` +{% include copy-curl.html %} + +The response contains the additional `inner_hits` field. The `_nested` field identifies the specific inner object from which the inner hit originated. It contains the nested hit and the offset relative to its position in the `_source`. Because of sorting and scoring, the position of the hit objects in `inner_hits` often differs from their original location in the nested object. 
+ +By default, the `_source` of the hit objects within `inner_hits` is returned relative to the `_nested` field. In this example, the `_source` within `inner_hits` contains the `name` and `age` fields as opposed to the top-level `_source`, which contains the whole `patient` object: + +```json +{ + "took": 38, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.2876821, + "hits": [ + { + "_index": "testindex", + "_id": "1", + "_score": 0.2876821, + "_source": { + "patient": { + "name": "John Doe", + "age": 56 + } + }, + "inner_hits": { + "patient": { + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.2876821, + "hits": [ + { + "_index": "testindex", + "_id": "1", + "_nested": { + "field": "patient", + "offset": 0 + }, + "_score": 0.2876821, + "_source": { + "name": "John Doe", + "age": 56 + } + } + ] + } + } + } + } + ] + } +} +``` + +You can disable returning `_source` by configuring the `_source` field in the mappings. For more information, see [Source]({{site.url}}{{site.baseurl}}/field-types/metadata-fields/source/). +{: .tip} + +For more information about retrieving inner hits, see [Inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/). + +## Multi-level nested queries + +You can search documents that have nested objects inside other nested objects using multi-level nested queries. In this example, you'll query multiple layers of nested fields by specifying a nested query for each level of the hierarchy. 
+ +First, create an index with multi-level nested fields: + +```json +PUT /patients +{ + "mappings": { + "properties": { + "patient": { + "type": "nested", + "properties": { + "name": { + "type": "text" + }, + "contacts": { + "type": "nested", + "properties": { + "name": { + "type": "text" + }, + "relationship": { + "type": "text" + }, + "phone": { + "type": "keyword" + } + } + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +Next, index a document into the example index: + +```json +PUT /patients/_doc/1 +{ + "patient": { + "name": "John Doe", + "contacts": [ + { + "name": "Jane Doe", + "relationship": "mother", + "phone": "5551111" + }, + { + "name": "Joe Doe", + "relationship": "father", + "phone": "5552222" + } + ] + } +} +``` +{% include copy-curl.html %} + +To search the nested `patient` field, use a multi-level `nested` query. The following query searches for patients whose contact information includes a person named `Jane` with a relationship of `mother`: + +```json +GET /patients/_search +{ + "query": { + "nested": { + "path": "patient", + "query": { + "nested": { + "path": "patient.contacts", + "query": { + "bool": { + "must": [ + { "match": { "patient.contacts.relationship": "mother" } }, + { "match": { "patient.contacts.name": "Jane" } } + ] + } + } + } + } + } + } +} +``` +{% include copy-curl.html %} + +The query returns the patient who has a contact entry matching these details: + +```json +{ + "took": 14, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.3862942, + "hits": [ + { + "_index": "patients", + "_id": "1", + "_score": 1.3862942, + "_source": { + "patient": { + "name": "John Doe", + "contacts": [ + { + "name": "Jane Doe", + "relationship": "mother", + "phone": "5551111" + }, + { + "name": "Joe Doe", + "relationship": "father", + "phone": "5552222" + } + ] + } + } + } + ] + } +} +``` + +## Parameters 
+ +The following table lists all top-level parameters supported by `nested` queries. + +| Parameter | Required/Optional | Description | +|:---|:---|:---| +| `path` | Required | Specifies the path to the nested object that you want to search. | +| `query` | Required | The query to run on the nested objects within the specified `path`. If a nested object matches the query, the root parent document is returned. You can search nested fields using dot notation, such as `nested_object.subfield`. Multi-level nesting is supported and automatically detected. Thus, an inner `nested` query within another nested query automatically matches the correct nesting level, instead of the root. | +| `ignore_unmapped` | Optional | Indicates whether to ignore unmapped `path` fields and not return documents instead of throwing an error. You can provide this parameter when querying multiple indexes, some of which may not contain the `path` field. Default is `false`. | +| `score_mode` | Optional | Defines how scores of matching inner documents influence the parent document's score. Valid values are:
- `avg`: Uses the average relevance score of all matching inner documents.
- `max`: Assigns the highest relevance score from the matching inner documents to the parent.
- `min`: Assigns the lowest relevance score from the matching inner documents to the parent.
- `sum`: Sums the relevance scores of all matching inner documents.
- `none`: Ignores the relevance scores of inner documents and assigns a score of `0` to the parent document.
Default is `avg`. | +| `inner_hits` | Optional | If provided, returns the underlying hits that matched the query. | + +## Next steps + +- Learn more about [retrieving inner hits]({{site.url}}{{site.baseurl}}/search-plugins/searching-data/inner-hits/). \ No newline at end of file diff --git a/_query-dsl/joining/parent-id.md b/_query-dsl/joining/parent-id.md new file mode 100644 index 0000000000..cbf86a796e --- /dev/null +++ b/_query-dsl/joining/parent-id.md @@ -0,0 +1,96 @@ +--- +layout: default +title: Parent ID +parent: Joining queries +nav_order: 40 +--- + +# Parent ID query + +The `parent_id` query returns child documents whose parent document has the specified ID. You can establish parent/child relationships between documents in the same index by using a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field type. + +## Example + +Before you can run a `parent_id` query, your index must contain a [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) field in order to establish parent/child relationships. The index mapping request uses the following format: + +```json +PUT /example_index +{ + "mappings": { + "properties": { + "relationship_field": { + "type": "join", + "relations": { + "parent_doc": "child_doc" + } + } + } + } +} +``` +{% include copy-curl.html %} + +For this example, first configure an index that contains documents representing products and their brands as described in the [`has_child` query example]({{site.url}}{{site.baseurl}}/query-dsl/joining/has-child/). + +To search for child documents of a specific parent document, use a `parent_id` query. 
The following query returns child documents (products) whose parent document has the ID `1`: + +```json +GET testindex1/_search +{ + "query": { + "parent_id": { + "type": "product", + "id": "1" + } + } +} +``` +{% include copy-curl.html %} + +The response returns the child product: + +```json +{ + "took": 57, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.87546873, + "hits": [ + { + "_index": "testindex1", + "_id": "3", + "_score": 0.87546873, + "_routing": "1", + "_source": { + "name": "Mechanical watch", + "sales_count": 150, + "product_to_brand": { + "name": "product", + "parent": "1" + } + } + } + ] + } +} +``` + +## Parameters + +The following table lists all top-level parameters supported by `parent_id` queries. + +| Parameter | Required/Optional | Description | +|:---|:---|:---| +| `type` | Required | Specifies the name of the child relationship as defined in the `join` field mapping. | +| `id` | Required | The ID of the parent document. The query returns child documents associated with this parent document. | +| `ignore_unmapped` | Optional | Indicates whether to ignore unmapped `type` fields and not return documents instead of throwing an error. You can provide this parameter when querying multiple indexes, some of which may not contain the `type` field. Default is `false`. | \ No newline at end of file diff --git a/_query-dsl/specialized/neural.md b/_query-dsl/specialized/neural.md index 14b930cdb6..6cd534b87f 100644 --- a/_query-dsl/specialized/neural.md +++ b/_query-dsl/specialized/neural.md @@ -35,6 +35,8 @@ Field | Data type | Required/Optional | Description `min_score` | Float | Optional | The minimum score threshold for the search results. Only one variable, either `k`, `min_score`, or `max_distance`, can be specified. 
For more information, see [k-NN radial search]({{site.url}}{{site.baseurl}}/search-plugins/knn/radial-search-knn/). `max_distance` | Float | Optional | The maximum distance threshold for the search results. Only one variable, either `k`, `min_score`, or `max_distance`, can be specified. For more information, see [k-NN radial search]({{site.url}}{{site.baseurl}}/search-plugins/knn/radial-search-knn/). `filter` | Object | Optional | A query that can be used to reduce the number of documents considered. For more information about filter usage, see [k-NN search with filters]({{site.url}}{{site.baseurl}}/search-plugins/knn/filter-search-knn/). **Important**: Filter can only be used with the `faiss` or `lucene` engines. +`method_parameters` | Object | Optional | Parameters passed to the k-NN index during search. See [Additional query parameters]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#additional-query-parameters). +`rescore` | Object | Optional | Parameters for configuring rescoring functionality for k-NN indexes built using quantization. See [Rescoring]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#rescoring-quantized-results-using-full-precision). #### Example request diff --git a/_query-dsl/term/terms.md b/_query-dsl/term/terms.md index 42c74c0436..7dac6a9619 100644 --- a/_query-dsl/term/terms.md +++ b/_query-dsl/term/terms.md @@ -39,6 +39,7 @@ Parameter | Data type | Description :--- | :--- | :--- `
`float cosineSimilarity (float[] queryVector, doc['vector field'], float normQueryVector)`
In general, the range of cosine similarity is [-1, 1]. However, in the case of information retrieval, the cosine similarity of two documents ranges from 0 to 1 because the tf-idf statistic can't be negative. Therefore, the k-NN plugin adds 1.0 in order to always yield a positive cosine similarity score. hamming | `float hamming (float[] queryVector, doc['vector field'])` | This function calculates the Hamming distance between a given query vector and document vectors. The Hamming distance is the number of positions at which the corresponding elements are different. The shorter the distance, the more relevant the document is, so this example inverts the return value of the Hamming distance. -The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors). +The `hamming` space type is supported for binary vectors in OpenSearch version 2.16 and later. For more information, see [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors). 
{: .note} ## Constraints diff --git a/_search-plugins/knn/performance-tuning.md b/_search-plugins/knn/performance-tuning.md index 123b1daef1..77f44dee93 100644 --- a/_search-plugins/knn/performance-tuning.md +++ b/_search-plugins/knn/performance-tuning.md @@ -59,9 +59,9 @@ The `_source` field contains the original JSON document body that was passed at "location": { "type": "knn_vector", "dimension": 2, + "space_type": "l2", "method": { "name": "hnsw", - "space_type": "l2", "engine": "faiss" } } @@ -85,9 +85,9 @@ In OpenSearch 2.15 or later, you can further improve indexing speed and reduce d "location": { "type": "knn_vector", "dimension": 2, + "space_type": "l2", "method": { "name": "hnsw", - "space_type": "l2", "engine": "faiss" } } diff --git a/_search-plugins/knn/radial-search-knn.md b/_search-plugins/knn/radial-search-knn.md index 1a4a223294..e5449a0993 100644 --- a/_search-plugins/knn/radial-search-knn.md +++ b/_search-plugins/knn/radial-search-knn.md @@ -53,9 +53,9 @@ PUT knn-index-test "my_vector": { "type": "knn_vector", "dimension": 2, + "space_type": "l2", "method": { "name": "hnsw", - "space_type": "l2", "engine": "faiss", "parameters": { "ef_construction": 100, diff --git a/_search-plugins/knn/settings.md b/_search-plugins/knn/settings.md index 1b9aa3608c..e4731ec94c 100644 --- a/_search-plugins/knn/settings.md +++ b/_search-plugins/knn/settings.md @@ -27,6 +27,7 @@ Setting | Static/Dynamic | Default | Description `knn.model.index.number_of_replicas`| Dynamic | `1` | The number of replica shards to use for the model system index. Generally, in a multi-node cluster, this value should be at least 1 in order to increase stability. `knn.model.cache.size.limit` | Dynamic | `10%` | The model cache limit cannot exceed 25% of the JVM heap. 
`knn.faiss.avx2.disabled` | Static | `false` | A static setting that specifies whether to disable the SIMD-based `libopensearchknn_faiss_avx2.so` library and load the non-optimized `libopensearchknn_faiss.so` library for the Faiss engine on machines with x64 architecture. For more information, see [SIMD optimization for the Faiss engine]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#simd-optimization-for-the-faiss-engine). +`knn.faiss.avx512.disabled` | Static | `false` | A static setting that specifies whether to disable the SIMD-based `libopensearchknn_faiss_avx512.so` library and load the `libopensearchknn_faiss_avx2.so` library or the non-optimized `libopensearchknn_faiss.so` library for the Faiss engine on machines with x64 architecture. For more information, see [SIMD optimization for the Faiss engine]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#simd-optimization-for-the-faiss-engine). ## Index settings diff --git a/_search-plugins/search-pipelines/using-search-pipeline.md b/_search-plugins/search-pipelines/using-search-pipeline.md index ecb988ad11..b6dbbdc5d0 100644 --- a/_search-plugins/search-pipelines/using-search-pipeline.md +++ b/_search-plugins/search-pipelines/using-search-pipeline.md @@ -17,14 +17,45 @@ You can use a search pipeline in the following ways: ## Specifying an existing search pipeline for a request -After you [create a search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/), you can use the pipeline with a query by specifying the pipeline name in the `search_pipeline` query parameter: +After you [create a search pipeline]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/creating-search-pipeline/), you can use the pipeline with a query in the following ways. 
For a complete example of using a search pipeline with a `filter_query` processor, see [`filter_query` processor example]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/filter-query-processor#example). + +### Specifying the pipeline in a query parameter + +You can specify the pipeline name in the `search_pipeline` query parameter as follows: ```json GET /my_index/_search?search_pipeline=my_pipeline ``` {% include copy-curl.html %} -For a complete example of using a search pipeline with a `filter_query` processor, see [`filter_query` processor example]({{site.url}}{{site.baseurl}}/search-plugins/search-pipelines/filter-query-processor#example). +### Specifying the pipeline in the request body + +You can provide a search pipeline ID in the search request body as follows: + +```json +GET /my-index/_search +{ + "query": { + "match_all": {} + }, + "from": 0, + "size": 10, + "search_pipeline": "my_pipeline" +} +``` +{% include copy-curl.html %} + +For multi-search, you can provide a search pipeline ID in the search request body as follows: + +```json +GET /_msearch +{ "index": "test"} +{ "query": { "match_all": {} }, "from": 0, "size": 10, "search_pipeline": "my_pipeline"} +{ "index": "test-1", "search_type": "dfs_query_then_fetch"} +{ "query": { "match_all": {} }, "search_pipeline": "my_pipeline1" } + +``` +{% include copy-curl.html %} ## Using a temporary search pipeline for a request diff --git a/_search-plugins/searching-data/inner-hits.md b/_search-plugins/searching-data/inner-hits.md index 395e9e748a..38fc7a491d 100644 --- a/_search-plugins/searching-data/inner-hits.md +++ b/_search-plugins/searching-data/inner-hits.md @@ -139,8 +139,8 @@ The preceding query searches for nested user objects containing the name John an } } ``` -## Inner hits with parent-child objects -Parent-join relationships allow you to create relationships between documents of different types within the same index. 
The following example request searches with `inner_hits` using parent-child objects. +## Inner hits with parent/child objects +Parent-join relationships allow you to create relationships between documents of different types within the same index. The following example request searches with `inner_hits` using parent/child objects. 1. Create an index with a parent-join field: @@ -806,4 +806,8 @@ The following is the expected result: Using `inner_hits` provides contextual relevance by showing exactly which nested or child documents match the query criteria. This is crucial for applications in which the relevance of results depends on a specific part of the document that matches the query. - Example use case: In a customer support system, you have tickets as parent documents and comments or updates as nested or child documents. You can determine which specific comment matches the search in order to better understand the context of the ticket search. \ No newline at end of file + Example use case: In a customer support system, you have tickets as parent documents and comments or updates as nested or child documents. You can determine which specific comment matches the search in order to better understand the context of the ticket search. + +## Next steps + +- Learn about [joining queries]({{site.url}}{{site.baseurl}}/query-dsl/joining/) on [nested]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/nested/) or [join]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/join/) fields. 
\ No newline at end of file diff --git a/_search-plugins/vector-search.md b/_search-plugins/vector-search.md index cd893f4144..f19030bf90 100644 --- a/_search-plugins/vector-search.md +++ b/_search-plugins/vector-search.md @@ -37,9 +37,9 @@ PUT test-index "my_vector1": { "type": "knn_vector", "dimension": 1024, + "space_type": "l2", "method": { "name": "hnsw", - "space_type": "l2", "engine": "nmslib", "parameters": { "ef_construction": 128, @@ -57,7 +57,7 @@ PUT test-index You must designate the field that will store vectors as a [`knn_vector`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/) field type. OpenSearch supports vectors of up to 16,000 dimensions, each of which is represented as a 32-bit or 16-bit float. -To save storage space, you can use `byte` or `binary` vectors. For more information, see [Lucene byte vector]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#lucene-byte-vector) and [Binary k-NN vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-k-nn-vectors). +To save storage space, you can use `byte` or `binary` vectors. For more information, see [Byte vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#byte-vectors) and [Binary vectors]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector#binary-vectors). 
### k-NN vector search @@ -131,9 +131,9 @@ PUT /hotels-index "location": { "type": "knn_vector", "dimension": 2, + "space_type": "l2", "method": { "name": "hnsw", - "space_type": "l2", "engine": "lucene", "parameters": { "ef_construction": 100, diff --git a/_security-analytics/threat-intelligence/getting-started.md b/_security-analytics/threat-intelligence/getting-started.md index 366bc2674c..b26063bed0 100644 --- a/_security-analytics/threat-intelligence/getting-started.md +++ b/_security-analytics/threat-intelligence/getting-started.md @@ -50,15 +50,64 @@ Local files uploaded as the threat intelligence source must use the following sp When using the `S3_SOURCE` as a remote store, the following connection information must be provided: -- **IAM Role ARN**: The Amazon Resource Name (ARN) for an AWS Identity and Access Management (IAM) role. -- **S3 bucket directory**: The name of the Amazon Simple Storage Service (Amazon S3) bucket in which the `STIX2` file is stored. -- **Specify a directory or file**: The object key or directory path for the `STIX2` file in the S3 bucket. +- **IAM Role ARN**: The Amazon Resource Name (ARN) for an AWS Identity and Access Management (IAM) role. When using the AWS OpenSearch Service, the role ARN needs to be in the same account as the OpenSearch domain. For more information about adding a new role for the AWS OpenSearch Service, see [Add service ARN](#add-aws-opensearch-service-arn). +- **S3 bucket directory**: The name of the Amazon Simple Storage Service (Amazon S3) bucket in which the `STIX2` file is stored. To access an S3 bucket in a different AWS account, see the [Cross-account S3 bucket connection](#cross-account-s3-bucket-connection) section for more details. +- **Specify a file**: The object key for the `STIX2` file in the S3 bucket. - **Region**: The AWS Region for the S3 bucket. You can also set the **Download schedule**, which determines when OpenSearch downloads an updated `STIX2` file from the connected S3 bucket. 
The default interval is once a day. Only daily intervals are supported. Alternatively, you can check the **Download on demand** option, which prevents new data from the bucket from being automatically downloaded. +#### Add AWS OpenSearch Service ARN + +If you're using the AWS OpenSearch Service, create a new ARN role with a custom trust policy. For instructions on how to create the role, see [Creating a role for an AWS service](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_create_for-service.html#roles-creatingrole-service-console). + +When creating the role, customize the following settings: + +- Add the following custom trust policy: + + ```bash + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": [ + "opensearchservice.amazonaws.com" + ] + }, + "Action": "sts:AssumeRole" + } + ] + } + ``` + +- On the Permissions policies page, add the `AmazonS3ReadOnlyAccess` permission. + + +#### Cross-account S3 bucket connection + +Because the role ARN needs to be in the same account as the OpenSearch domain, a trust policy needs to be configured that allows the OpenSearch domain to download from S3 buckets in the same account. 
+ +To download from an S3 bucket in another account, the trust policy for that bucket needs to give the role ARN permission to read from the object, as shown in the following example: + +``` +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "AWS": "arn:aws:iam::123456789012:role/account-1-threat-intel-role" + }, + "Action": "s3:*", + "Resource": "arn:aws:s3:::account-2-threat-intel-bucket/*" + } + ] +} +``` ## Step 2: Set up scanning for your log sources diff --git a/_security/access-control/document-level-security.md b/_security/access-control/document-level-security.md index 352fe06a61..b17b60e147 100644 --- a/_security/access-control/document-level-security.md +++ b/_security/access-control/document-level-security.md @@ -13,6 +13,8 @@ Document-level security lets you restrict a role to a subset of documents in an ![Document- and field-level security screen in OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/images/security-dls.png) +The maximum size for the document-level security configuration is 1024 KB (1,048,404 characters). +{: .warning} ## Simple roles diff --git a/_security/audit-logs/index.md b/_security/audit-logs/index.md index becb001ec0..8eeea33447 100644 --- a/_security/audit-logs/index.md +++ b/_security/audit-logs/index.md @@ -224,3 +224,36 @@ plugins.security.audit.config.threadpool.max_queue_len: 100000 To disable audit logs after they've been enabled, remove the `plugins.security.audit.type: internal_opensearch` setting from `opensearch.yml`, or switch off the **Enable audit logging** check box in OpenSearch Dashboards. 
+## Audit user account manipulation + +To enable audit logging on changes to a security index, such as changes to roles mappings and role creation or deletion, use the following settings in the `compliance:` portion of the audit log configuration, as shown in the following example: + +``` +_meta: + type: "audit" + config_version: 2 + +config: + # enable/disable audit logging + enabled: true + + ... + + + compliance: + # enable/disable compliance + enabled: true + + # Log updates to internal security changes + internal_config: true + + # Log only metadata of the document for write events + write_metadata_only: false + + # Log only diffs for document updates + write_log_diffs: true + + # List of indices to watch for write events. Wildcard patterns are supported + # write_watched_indices: ["twitter", "logs-*"] + write_watched_indices: [".opendistro_security"] +``` diff --git a/_security/authentication-backends/jwt.md b/_security/authentication-backends/jwt.md index 3f28dfecfd..6c7311e7dc 100644 --- a/_security/authentication-backends/jwt.md +++ b/_security/authentication-backends/jwt.md @@ -117,7 +117,7 @@ The following table lists the configuration parameters. Name | Description :--- | :--- -`signing_key` | The signing key to use when verifying the token. If you use a symmetric key algorithm, it is the base64-encoded shared secret. If you use an asymmetric algorithm, it contains the public key. +`signing_key` | The signing key(s) used to verify the token. If you use a symmetric key algorithm, this is the Base64-encoded shared secret. If you use an asymmetric algorithm, the algorithm contains the public key. To pass multiple keys, use a comma-separated list or enumerate the keys. `jwt_header` | The HTTP header in which the token is transmitted. This is typically the `Authorization` header with the `Bearer` schema,`Authorization: Bearer