From aa6822e005472f1700fec0fe9be10d4cff41f986 Mon Sep 17 00:00:00 2001
From: Jordan Powers
Date: Fri, 17 Jan 2025 09:15:22 -0800
Subject: [PATCH] Initial native synthetic source for counted_keyword fields
(#120078)
Natively support synthetic source for the counted_keyword field type
if the "synthetic_source_keep" mapping attribute is "none".
Right now we don't have the logic set up to get the correct value of
synthetic_source_keep if the value is inherited.
Until we get that set up, we can only confidently use the doc_values
implementation of synthetic_source if the synthetic_source_keep is
explicitly set to "none" in the mapping parameters.
---
.../index/mapper/MapperFeatures.java | 5 +
.../CountedKeywordFieldMapper.java | 110 +++++++++++++++-
.../CountedKeywordFieldMapperTests.java | 103 ++++++++++++++-
.../counted_keyword/30_synthetic_source.yml | 123 ++++++++++++++++++
4 files changed, 336 insertions(+), 5 deletions(-)
create mode 100644 x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/counted_keyword/30_synthetic_source.yml
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java b/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java
index 1097c1f0ea16b..c423b878320df 100644
--- a/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java
+++ b/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java
@@ -32,6 +32,10 @@ public Set getFeatures() {
"mapper.constant_keyword.synthetic_source_write_fix"
);
+ public static final NodeFeature COUNTED_KEYWORD_SYNTHETIC_SOURCE_NATIVE_SUPPORT = new NodeFeature(
+ "mapper.counted_keyword.synthetic_source_native_support"
+ );
+
public static final NodeFeature META_FETCH_FIELDS_ERROR_CODE_CHANGED = new NodeFeature("meta_fetch_fields_error_code_changed");
public static final NodeFeature SPARSE_VECTOR_STORE_SUPPORT = new NodeFeature("mapper.sparse_vector.store_support");
@@ -49,6 +53,7 @@ public Set getTestFeatures() {
CONSTANT_KEYWORD_SYNTHETIC_SOURCE_WRITE_FIX,
META_FETCH_FIELDS_ERROR_CODE_CHANGED,
SPARSE_VECTOR_STORE_SUPPORT,
+ COUNTED_KEYWORD_SYNTHETIC_SOURCE_NATIVE_SUPPORT,
SourceFieldMapper.SYNTHETIC_RECOVERY_SOURCE
);
}
diff --git a/x-pack/plugin/mapper-counted-keyword/src/main/java/org/elasticsearch/xpack/countedkeyword/CountedKeywordFieldMapper.java b/x-pack/plugin/mapper-counted-keyword/src/main/java/org/elasticsearch/xpack/countedkeyword/CountedKeywordFieldMapper.java
index 3a50cc8143485..1a765ca06efbc 100644
--- a/x-pack/plugin/mapper-counted-keyword/src/main/java/org/elasticsearch/xpack/countedkeyword/CountedKeywordFieldMapper.java
+++ b/x-pack/plugin/mapper-counted-keyword/src/main/java/org/elasticsearch/xpack/countedkeyword/CountedKeywordFieldMapper.java
@@ -13,11 +13,13 @@
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.util.BigArrays;
@@ -35,6 +37,7 @@
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.Mapper;
import org.elasticsearch.index.mapper.MapperBuilderContext;
+import org.elasticsearch.index.mapper.SourceLoader;
import org.elasticsearch.index.mapper.SourceValueFetcher;
import org.elasticsearch.index.mapper.StringFieldType;
import org.elasticsearch.index.mapper.TextSearchInfo;
@@ -46,6 +49,7 @@
import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
import org.elasticsearch.search.sort.BucketedSort;
import org.elasticsearch.search.sort.SortOrder;
+import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentParser;
import java.io.IOException;
@@ -72,7 +76,8 @@
* 2 for each key (one per document), a counted_terms
aggregation on a counted_keyword
field will consider
* the actual count and report a count of 3 for each key.
*
- * Only regular source is supported; synthetic source won't work.
+ * Synthetic source is supported, but uses the fallback "ignore source" infrastructure unless the source_keep_mode
is
+ * explicitly set to none
in the field mapping parameters.
*/
public class CountedKeywordFieldMapper extends FieldMapper {
public static final String CONTENT_TYPE = "counted_keyword";
@@ -306,6 +311,81 @@ public FieldMapper build(MapperBuilderContext context) {
}
}
+ private static class CountedKeywordFieldSyntheticSourceLoader extends SourceLoader.DocValuesBasedSyntheticFieldLoader {
+ private final String keywordsFieldName;
+ private final String countsFieldName;
+ private final String leafName;
+
+ private SortedSetDocValues keywordsReader;
+ private BinaryDocValues countsReader;
+ private boolean hasValue;
+
+ CountedKeywordFieldSyntheticSourceLoader(String keywordsFieldName, String countsFieldName, String leafName) {
+ this.keywordsFieldName = keywordsFieldName;
+ this.countsFieldName = countsFieldName;
+ this.leafName = leafName;
+ }
+
+ @Override
+ public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException {
+ keywordsReader = leafReader.getSortedSetDocValues(keywordsFieldName);
+ countsReader = leafReader.getBinaryDocValues(countsFieldName);
+
+ if (keywordsReader == null || countsReader == null) {
+ return null;
+ }
+
+ return docId -> {
+ hasValue = keywordsReader.advanceExact(docId);
+ if (hasValue == false) {
+ return false;
+ }
+
+ boolean countsHasValue = countsReader.advanceExact(docId);
+ assert countsHasValue;
+
+ return true;
+ };
+ }
+
+ @Override
+ public boolean hasValue() {
+ return hasValue;
+ }
+
+ @Override
+ public void write(XContentBuilder b) throws IOException {
+ if (hasValue == false) {
+ return;
+ }
+
+ int[] counts = new BytesArray(countsReader.binaryValue()).streamInput().readVIntArray();
+ boolean singleValue = counts.length == 1 && counts[0] == 1;
+
+ if (singleValue) {
+ b.field(leafName);
+ } else {
+ b.startArray(leafName);
+ }
+
+ for (int i = 0; i < keywordsReader.docValueCount(); i++) {
+ BytesRef currKeyword = keywordsReader.lookupOrd(keywordsReader.nextOrd());
+ for (int j = 0; j < counts[i]; j++) {
+ b.utf8Value(currKeyword.bytes, currKeyword.offset, currKeyword.length);
+ }
+ }
+
+ if (singleValue == false) {
+ b.endArray();
+ }
+ }
+
+ @Override
+ public String fieldName() {
+ return keywordsFieldName;
+ }
+ }
+
public static TypeParser PARSER = new TypeParser((n, c) -> new CountedKeywordFieldMapper.Builder(n));
private final FieldType fieldType;
@@ -342,6 +422,11 @@ protected void parseCreateField(DocumentParserContext context) throws IOExceptio
} else {
throw new IllegalArgumentException("Encountered unexpected token [" + parser.currentToken() + "].");
}
+
+ if (values.isEmpty()) {
+ return;
+ }
+
int i = 0;
int[] counts = new int[values.size()];
for (Map.Entry value : values.entrySet()) {
@@ -355,13 +440,18 @@ protected void parseCreateField(DocumentParserContext context) throws IOExceptio
private void parseArray(DocumentParserContext context, SortedMap values) throws IOException {
XContentParser parser = context.parser();
+ int arrDepth = 1;
while (true) {
XContentParser.Token token = parser.nextToken();
if (token == XContentParser.Token.END_ARRAY) {
- return;
- }
- if (token == XContentParser.Token.VALUE_STRING) {
+ arrDepth -= 1;
+ if (arrDepth <= 0) {
+ return;
+ }
+ } else if (token == XContentParser.Token.VALUE_STRING) {
parseValue(parser, values);
+ } else if (token == XContentParser.Token.START_ARRAY) {
+ arrDepth += 1;
} else if (token == XContentParser.Token.VALUE_NULL) {
// ignore null values
} else {
@@ -399,4 +489,16 @@ public FieldMapper.Builder getMergeBuilder() {
protected String contentType() {
return CONTENT_TYPE;
}
+
+ @Override
+ protected SyntheticSourceSupport syntheticSourceSupport() {
+ var keepMode = sourceKeepMode();
+ if (keepMode.isPresent() == false || keepMode.get() != SourceKeepMode.NONE) {
+ return super.syntheticSourceSupport();
+ }
+
+ var loader = new CountedKeywordFieldSyntheticSourceLoader(fullPath(), countFieldMapper.fullPath(), leafName());
+ return new SyntheticSourceSupport.Native(loader);
+ }
+
}
diff --git a/x-pack/plugin/mapper-counted-keyword/src/test/java/org/elasticsearch/xpack/countedkeyword/CountedKeywordFieldMapperTests.java b/x-pack/plugin/mapper-counted-keyword/src/test/java/org/elasticsearch/xpack/countedkeyword/CountedKeywordFieldMapperTests.java
index 2ffd4468c814a..c99edcf7352fa 100644
--- a/x-pack/plugin/mapper-counted-keyword/src/test/java/org/elasticsearch/xpack/countedkeyword/CountedKeywordFieldMapperTests.java
+++ b/x-pack/plugin/mapper-counted-keyword/src/test/java/org/elasticsearch/xpack/countedkeyword/CountedKeywordFieldMapperTests.java
@@ -10,11 +10,15 @@
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
+import org.elasticsearch.core.CheckedConsumer;
+import org.elasticsearch.core.Tuple;
import org.elasticsearch.index.mapper.DocumentMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.MapperTestCase;
import org.elasticsearch.index.mapper.ParsedDocument;
import org.elasticsearch.plugins.Plugin;
+import org.elasticsearch.search.lookup.SourceFilter;
+import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.xcontent.XContentBuilder;
import org.junit.AssumptionViolatedException;
@@ -22,6 +26,9 @@
import java.util.Collection;
import java.util.Collections;
import java.util.List;
+import java.util.stream.Stream;
+
+import static org.hamcrest.Matchers.equalTo;
public class CountedKeywordFieldMapperTests extends MapperTestCase {
@Override
@@ -64,9 +71,103 @@ protected Object generateRandomInputValue(MappedFieldType ft) {
return randomBoolean() ? null : randomAlphaOfLengthBetween(1, 10);
}
+ public void testSyntheticSourceSingleNullValue() throws IOException {
+ DocumentMapper mapper = createSytheticSourceMapperService(mapping(b -> {
+ b.startObject("field");
+ minimalMapping(b);
+ b.field("synthetic_source_keep", "none");
+ b.endObject();
+ })).documentMapper();
+
+ String expected = "{}";
+ CheckedConsumer buildInput = b -> {
+ b.field("field");
+ b.nullValue();
+ };
+
+ assertThat(syntheticSource(mapper, buildInput), equalTo(expected));
+ assertThat(syntheticSource(mapper, new SourceFilter(new String[] { "field" }, null), buildInput), equalTo(expected));
+ assertThat(syntheticSource(mapper, new SourceFilter(null, new String[] { "field" }), buildInput), equalTo("{}"));
+ }
+
+ public void testSyntheticSourceManyNullValue() throws IOException {
+ DocumentMapper mapper = createSytheticSourceMapperService(mapping(b -> {
+ b.startObject("field");
+ minimalMapping(b);
+ b.field("synthetic_source_keep", "none");
+ b.endObject();
+ })).documentMapper();
+
+ int nullCount = randomIntBetween(1, 5);
+
+ String expected = "{}";
+ CheckedConsumer buildInput = b -> {
+ b.startArray("field");
+ for (int i = 0; i < nullCount; i++) {
+ b.nullValue();
+ }
+ b.endArray();
+ };
+
+ assertThat(syntheticSource(mapper, buildInput), equalTo(expected));
+ assertThat(syntheticSource(mapper, new SourceFilter(new String[] { "field" }, null), buildInput), equalTo(expected));
+ assertThat(syntheticSource(mapper, new SourceFilter(null, new String[] { "field" }), buildInput), equalTo("{}"));
+ }
+
+ @Override
+ public void testSyntheticSourceKeepAll() throws IOException {
+ // For now, native synthetic source is only supported when "synthetic_source_keep" mapping attribute is "none"
+ }
+
+ @Override
+ public void testSyntheticSourceKeepArrays() throws IOException {
+ // For now, native synthetic source is only supported when "synthetic_source_keep" mapping attribute is "none"
+ }
+
+ @Override
+ public void testSyntheticSourceKeepNone() throws IOException {
+ // For now, native synthetic source is only supported when "synthetic_source_keep" mapping attribute is "none"
+ }
+
@Override
protected SyntheticSourceSupport syntheticSourceSupport(boolean ignoreMalformed) {
- throw new AssumptionViolatedException("not supported");
+ return new SyntheticSourceSupport() {
+ @Override
+ public SyntheticSourceExample example(int maxValues) throws IOException {
+ if (randomBoolean()) {
+ Tuple v = generateValue();
+ return new SyntheticSourceExample(v.v1(), v.v2(), this::mapping);
+ }
+ int maxNullValues = 5;
+ List> values = randomList(1, maxValues, this::generateValue);
+ List in = Stream.concat(values.stream().map(Tuple::v1), randomList(0, maxNullValues, () -> (String) null).stream())
+ .toList();
+
+ in = shuffledList(in);
+
+ List outList = values.stream().map(Tuple::v2).sorted().toList();
+
+ Object out = outList.size() == 1 ? outList.get(0) : outList;
+ return new SyntheticSourceExample(in, out, this::mapping);
+ }
+
+ private Tuple generateValue() {
+ String v = ESTestCase.randomAlphaOfLength(5);
+ return Tuple.tuple(v, v);
+ }
+
+ private void mapping(XContentBuilder b) throws IOException {
+ minimalMapping(b);
+ // For now, synthetic source is only supported when "synthetic_source_keep" is "none".
+ // Once we implement true synthetic source support, we should remove this.
+ b.field("synthetic_source_keep", "none");
+ }
+
+ @Override
+ public List invalidExample() throws IOException {
+ return List.of();
+ }
+ };
}
@Override
diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/counted_keyword/30_synthetic_source.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/counted_keyword/30_synthetic_source.yml
new file mode 100644
index 0000000000000..7ade369893f4b
--- /dev/null
+++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/counted_keyword/30_synthetic_source.yml
@@ -0,0 +1,123 @@
+setup:
+ - requires:
+ cluster_features: ["mapper.counted_keyword.synthetic_source_native_support"]
+ reason: "Feature implemented"
+
+ - do:
+ indices.create:
+ index: test-events
+ body:
+ settings:
+ index:
+ mapping.source.mode: synthetic
+ mappings:
+ properties:
+ events:
+ type: counted_keyword
+ synthetic_source_keep: none
+
+
+ - do:
+ index:
+ index: test-events
+ id: "1"
+ body: { "events": [ "a", "b", "a", "c" ] }
+
+ - do:
+ index:
+ index: test-events
+ id: "2"
+ body: { "events": ["b", "b", "c", "a", "b"] }
+
+ - do:
+ index:
+ index: test-events
+ id: "3"
+ body: { "events": ["c", "a", null, "b", null, "c"]}
+
+ - do:
+ index:
+ index: test-events
+ id: "4"
+ body: { "events": ["a"]}
+
+ - do:
+ index:
+ index: test-events
+ id: "5"
+ body: { "events": []}
+
+ - do:
+ index:
+ index: test-events
+ id: "6"
+ body: { "events": [null, null]}
+
+ - do:
+ indices.refresh: { }
+
+---
+"Source values are mutated as expected":
+ - do:
+ search:
+ index: test-events
+ body:
+ query:
+ ids:
+ values: [1]
+ - match:
+ hits.hits.0._source:
+ events: ["a", "a", "b", "c"]
+
+ - do:
+ search:
+ index: test-events
+ body:
+ query:
+ ids:
+ values: [2]
+ - match:
+ hits.hits.0._source:
+ events: ["a", "b", "b", "b", "c"]
+
+ - do:
+ search:
+ index: test-events
+ body:
+ query:
+ ids:
+ values: [3]
+ - match:
+ hits.hits.0._source:
+ events: ["a", "b", "c", "c"]
+
+ - do:
+ search:
+ index: test-events
+ body:
+ query:
+ ids:
+ values: [4]
+ - match:
+ hits.hits.0._source:
+ events: "a"
+
+ - do:
+ search:
+ index: test-events
+ body:
+ query:
+ ids:
+ values: [5]
+ - match:
+ hits.hits.0._source: {}
+
+ - do:
+ search:
+ index: test-events
+ body:
+ query:
+ ids:
+ values: [6]
+ - match:
+ hits.hits.0._source: {}