Merge branch 'main' into asymmetric-embeddings-620

Signed-off-by: Martin Gaievski <[email protected]>
opensearch-project · Jun 11, 2024 · 400cace · 400cace
2 parents 6262b57 + 81e9630
commit 400cace
Show file tree

Hide file tree

Showing 62 changed files with 3,856 additions and 856 deletions.
diff --git a/.github/workflows/backwards_compatibility_tests_workflow.yml b/.github/workflows/backwards_compatibility_tests_workflow.yml
@@ -15,7 +15,7 @@ jobs:
       matrix:
         java: [ 11, 17, 21 ]
         os: [ubuntu-latest,windows-latest]
-        bwc_version : ["2.9.0","2.10.0","2.11.0","2.12.0","2.13.0","2.14.0-SNAPSHOT"]
+        bwc_version : ["2.9.0","2.10.0","2.11.0","2.12.0","2.13.0","2.14.0"]
         opensearch_version : [ "3.0.0-SNAPSHOT" ]
 
     name: NeuralSearch Restart-Upgrade BWC Tests
@@ -42,7 +42,7 @@ jobs:
       matrix:
         java: [ 11, 17, 21 ]
         os: [ubuntu-latest,windows-latest]
-        bwc_version: [ "2.14.0-SNAPSHOT" ]
+        bwc_version: [ "2.15.0-SNAPSHOT" ]
         opensearch_version: [ "3.0.0-SNAPSHOT" ]
 
     name: NeuralSearch Rolling-Upgrade BWC Tests

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,29 +7,27 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 ### Features
 ### Enhancements
 ### Bug Fixes
-- Fix async actions are left in neural_sparse query ([#438](https://github.com/opensearch-project/neural-search/pull/438))
-- Fix typo for sparse encoding processor factory([#578](https://github.com/opensearch-project/neural-search/pull/578))
-- Add non-null check for queryBuilder in NeuralQueryEnricherProcessor ([#615](https://github.com/opensearch-project/neural-search/pull/615))
-- Add max_token_score field placeholder in NeuralSparseQueryBuilder to fix the rolling-upgrade from 2.x nodes bwc tests. ([#696](https://github.com/opensearch-project/neural-search/pull/696))
 ### Infrastructure
-- Adding integration tests for scenario of hybrid query with aggregations ([#632](https://github.com/opensearch-project/neural-search/pull/632))
 ### Documentation
 ### Maintenance
 ### Refactoring
 
-## [Unreleased 2.x](https://github.com/opensearch-project/neural-search/compare/2.13...2.x)
+## [Unreleased 2.x](https://github.com/opensearch-project/neural-search/compare/2.14...2.x)
 ### Features
 - Support k-NN radial search parameters in neural search([#697](https://github.com/opensearch-project/neural-search/pull/697))
 - Add support for asymmetric embedding models ([#710](https://github.com/opensearch-project/neural-search/pull/710))
+- Speed up NeuralSparseQuery by two-phase using a custom search pipeline ([#646](https://github.com/opensearch-project/neural-search/issues/646))
+- Support batchExecute in TextEmbeddingProcessor and SparseEncodingProcessor ([#743](https://github.com/opensearch-project/neural-search/issues/743))
 ### Enhancements
-- BWC tests for text chunking processor ([#661](https://github.com/opensearch-project/neural-search/pull/661))
-- Allowing execution of hybrid query on index alias with filters ([#670](https://github.com/opensearch-project/neural-search/pull/670))
-- Allowing query by raw tokens in neural_sparse query ([#693](https://github.com/opensearch-project/neural-search/pull/693))
-- Removed stream.findFirst implementation to use more native iteration implement to improve hybrid query latencies by 35% ([#706](https://github.com/opensearch-project/neural-search/pull/706))
+- Pass empty doc collector instead of top docs collector to improve hybrid query latencies by 20% ([#731](https://github.com/opensearch-project/neural-search/pull/731))
+- Optimize parameter parsing in text chunking processor ([#733](https://github.com/opensearch-project/neural-search/pull/733))
+- Use lazy initialization for priority queue of hits and scores to improve latencies by 20% ([#746](https://github.com/opensearch-project/neural-search/pull/746))
+- Optimize max score calculation in the Query Phase of the Hybrid Search ([#765](https://github.com/opensearch-project/neural-search/pull/765))
 ### Bug Fixes
-- Add support for request_cache flag in hybrid query ([#663](https://github.com/opensearch-project/neural-search/pull/663))
+- Total hit count fix in Hybrid Query ([#756](https://github.com/opensearch-project/neural-search/pull/756))
+- Fix map type validation issue in multiple pipeline processors ([#661](https://github.com/opensearch-project/neural-search/pull/661))
 ### Infrastructure
+- Disable memory circuit breaker for integ tests ([#770](https://github.com/opensearch-project/neural-search/pull/770))
 ### Documentation
 ### Maintenance
-- Update bwc tests for neural_query_enricher neural_sparse search ([#652](https://github.com/opensearch-project/neural-search/pull/652))
 ### Refactoring
diff --git a/SECURITY.md b/SECURITY.md
@@ -1,3 +1,3 @@
 ## Reporting a Vulnerability
 
-If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/) or directly via email to aws-security@amazon.com. Please do **not** create a public GitHub issue.
+If you discover a potential security issue in this project we ask that you notify OpenSearch Security directly via email to security@opensearch.org. Please do **not** create a public GitHub issue.
diff --git a/build.gradle b/build.gradle
@@ -396,7 +396,7 @@ testClusters.integTest {
 
     // Increase heap size from default of 512mb to 1gb. When heap size is 512mb, our integ tests sporadically fail due
     // to ml-commons memory circuit breaker exception
-    jvmArgs("-Xms1g", "-Xmx4g")
+    jvmArgs("-Xms1g", "-Xmx2g")
 }
 
 // Remote Integration Tests

diff --git a/gradle.properties b/gradle.properties
@@ -7,8 +7,8 @@
 # https://github.com/opensearch-project/OpenSearch/blob/main/libs/core/src/main/java/org/opensearch/Version.java .
 # Wire compatibility of OpenSearch works such that a 3.x version is compatible with the 2.(latest-major) version.
 # Therefore, to run the rolling-upgrade BWC test on a local machine, the BWC version here should be set to 2.(latest-major).
-systemProp.bwc.version=2.14.0-SNAPSHOT
-systemProp.bwc.bundle.version=2.14.0
+systemProp.bwc.version=2.15.0-SNAPSHOT
+systemProp.bwc.bundle.version=2.15.0
 
 # For fixing Spotless check with Java 17
 org.gradle.jvmargs=--add-exports jdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED \

diff --git a/...estart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/TextChunkingProcessorIT.java b/...estart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/TextChunkingProcessorIT.java
@@ -28,7 +28,7 @@ public class TextChunkingProcessorIT extends AbstractRestartUpgradeRestTestCase
         "standard tokenizer in OpenSearch."
     );
 
-    // Test rolling-upgrade text chunking processor
+    // Test restart-upgrade text chunking processor
     // Create Text Chunking Processor, Ingestion Pipeline and add document
     // Validate process, pipeline and document count in restart-upgrade scenario
     public void testTextChunkingProcessor_E2EFlow() throws Exception {

diff --git a/qa/rolling-upgrade/build.gradle b/qa/rolling-upgrade/build.gradle
@@ -203,6 +203,13 @@ task testAgainstTwoThirdsUpgradedCluster(type: StandaloneRestIntegTestTask) {
         }
     }
 
+    // Excluding the k-NN radial search tests because this feature was introduced in 2.14
+    if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12") || ext.neural_search_bwc_version.startsWith("2.13")){
+        filter {
+            excludeTestsMatching "org.opensearch.neuralsearch.bwc.KnnRadialSearchIT.*"
+        }
+    }
+
     nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
     nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
     systemProperty 'tests.security.manager', 'false'
@@ -252,6 +259,13 @@ task testRollingUpgrade(type: StandaloneRestIntegTestTask) {
         }
     }
 
+    // Excluding the k-NN radial search tests because this feature was introduced in 2.14
+    if (ext.neural_search_bwc_version.startsWith("2.9") || ext.neural_search_bwc_version.startsWith("2.10") || ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12") || ext.neural_search_bwc_version.startsWith("2.13")){
+        filter {
+            excludeTestsMatching "org.opensearch.neuralsearch.bwc.KnnRadialSearchIT.*"
+        }
+    }
+
     nonInputProperties.systemProperty('tests.rest.cluster', "${-> testClusters."${baseName}".allHttpSocketURI.join(",")}")
     nonInputProperties.systemProperty('tests.clustername', "${-> testClusters."${baseName}".getName()}")
     systemProperty 'tests.security.manager', 'false'

diff --git a/release-notes/opensearch-neural-search.release-notes-2.14.0.0.md b/release-notes/opensearch-neural-search.release-notes-2.14.0.0.md
@@ -0,0 +1,24 @@
+## Version 2.14.0.0 Release Notes
+
+Compatible with OpenSearch 2.14.0
+
+### Features
+* Support k-NN radial search parameters in neural search([#697](https://github.com/opensearch-project/neural-search/pull/697))
+### Enhancements
+* BWC tests for text chunking processor ([#661](https://github.com/opensearch-project/neural-search/pull/661))
+* Add support for request_cache flag in hybrid query ([#663](https://github.com/opensearch-project/neural-search/pull/663))
+* Allowing execution of hybrid query on index alias with filters ([#670](https://github.com/opensearch-project/neural-search/pull/670))
+* Allowing query by raw tokens in neural_sparse query ([#693](https://github.com/opensearch-project/neural-search/pull/693))
+* Removed stream.findFirst implementation to use more native iteration implement to improve hybrid query latencies by 35% ([#706](https://github.com/opensearch-project/neural-search/pull/706))
+* Removed map of subquery to subquery index in favor of storing index as part of disi wrapper to improve hybrid query latencies by 20% ([#711](https://github.com/opensearch-project/neural-search/pull/711))
+* Avoid change max_chunk_limit exceed exception in text chunking processor ([#717](https://github.com/opensearch-project/neural-search/pull/717))
+### Bug Fixes
+* Fix async actions are left in neural_sparse query ([#438](https://github.com/opensearch-project/neural-search/pull/438))
+* Fix typo for sparse encoding processor factory([#578](https://github.com/opensearch-project/neural-search/pull/578))
+* Add non-null check for queryBuilder in NeuralQueryEnricherProcessor ([#615](https://github.com/opensearch-project/neural-search/pull/615))
+* Add max_token_score field placeholder in NeuralSparseQueryBuilder to fix the rolling-upgrade from 2.x nodes bwc tests. ([#696](https://github.com/opensearch-project/neural-search/pull/696))
+* Fix multi node "no such index" error in text chunking processor. ([#713](https://github.com/opensearch-project/neural-search/pull/713))
+### Infrastructure
+* Adding integration tests for scenario of hybrid query with aggregations ([#632](https://github.com/opensearch-project/neural-search/pull/632))
+### Maintenance
+* Update bwc tests for neural_query_enricher neural_sparse search ([#652](https://github.com/opensearch-project/neural-search/pull/652))
diff --git a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java
@@ -27,6 +27,7 @@
 import org.opensearch.ml.client.MachineLearningNodeClient;
 import org.opensearch.neuralsearch.ml.MLCommonsClientAccessor;
 import org.opensearch.neuralsearch.processor.NeuralQueryEnricherProcessor;
+import org.opensearch.neuralsearch.processor.NeuralSparseTwoPhaseProcessor;
 import org.opensearch.neuralsearch.processor.NormalizationProcessor;
 import org.opensearch.neuralsearch.processor.NormalizationProcessorWorkflow;
 import org.opensearch.neuralsearch.processor.SparseEncodingProcessor;
@@ -112,18 +113,13 @@ public Map<String, Processor.Factory> getProcessors(Processor.Parameters paramet
         clientAccessor = new MLCommonsClientAccessor(new MachineLearningNodeClient(parameters.client));
         return Map.of(
             TextEmbeddingProcessor.TYPE,
-            new TextEmbeddingProcessorFactory(clientAccessor, parameters.env),
+            new TextEmbeddingProcessorFactory(clientAccessor, parameters.env, parameters.ingestService.getClusterService()),
             SparseEncodingProcessor.TYPE,
-            new SparseEncodingProcessorFactory(clientAccessor, parameters.env),
+            new SparseEncodingProcessorFactory(clientAccessor, parameters.env, parameters.ingestService.getClusterService()),
             TextImageEmbeddingProcessor.TYPE,
             new TextImageEmbeddingProcessorFactory(clientAccessor, parameters.env, parameters.ingestService.getClusterService()),
             TextChunkingProcessor.TYPE,
-            new TextChunkingProcessorFactory(
-                parameters.env,
-                parameters.ingestService.getClusterService(),
-                parameters.indicesService,
-                parameters.analysisRegistry
-            )
+            new TextChunkingProcessorFactory(parameters.env, parameters.ingestService.getClusterService(), parameters.analysisRegistry)
         );
     }
 
@@ -162,7 +158,12 @@ public List<Setting<?>> getSettings() {
     public Map<String, org.opensearch.search.pipeline.Processor.Factory<SearchRequestProcessor>> getRequestProcessors(
         Parameters parameters
     ) {
-        return Map.of(NeuralQueryEnricherProcessor.TYPE, new NeuralQueryEnricherProcessor.Factory());
+        return Map.of(
+            NeuralQueryEnricherProcessor.TYPE,
+            new NeuralQueryEnricherProcessor.Factory(),
+            NeuralSparseTwoPhaseProcessor.TYPE,
+            new NeuralSparseTwoPhaseProcessor.Factory()
+        );
     }
 
     @Override