From aa6822e005472f1700fec0fe9be10d4cff41f986 Mon Sep 17 00:00:00 2001 From: Jordan Powers Date: Fri, 17 Jan 2025 09:15:22 -0800 Subject: [PATCH] Initial native synthetic source for counted_keyword fields (#120078) Natively support synthetic source for the counted_keyword field type if the "synthetic_source_keep" mapping attribute is "none". Right now we don't have the logic set up to get the correct value of synthetic_source_keep if the value is inherited. Until we get that set up, we can only confidently use the doc_values implementation of synthetic_source if the synthetic_source_keep is explicitly set to "none" in the mapping parameters. --- .../index/mapper/MapperFeatures.java | 5 + .../CountedKeywordFieldMapper.java | 110 +++++++++++++++- .../CountedKeywordFieldMapperTests.java | 103 ++++++++++++++- .../counted_keyword/30_synthetic_source.yml | 123 ++++++++++++++++++ 4 files changed, 336 insertions(+), 5 deletions(-) create mode 100644 x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/counted_keyword/30_synthetic_source.yml diff --git a/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java b/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java index 1097c1f0ea16b..c423b878320df 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java @@ -32,6 +32,10 @@ public Set getFeatures() { "mapper.constant_keyword.synthetic_source_write_fix" ); + public static final NodeFeature COUNTED_KEYWORD_SYNTHETIC_SOURCE_NATIVE_SUPPORT = new NodeFeature( + "mapper.counted_keyword.synthetic_source_native_support" + ); + public static final NodeFeature META_FETCH_FIELDS_ERROR_CODE_CHANGED = new NodeFeature("meta_fetch_fields_error_code_changed"); public static final NodeFeature SPARSE_VECTOR_STORE_SUPPORT = new NodeFeature("mapper.sparse_vector.store_support"); @@ -49,6 +53,7 @@ public Set getTestFeatures() { 
CONSTANT_KEYWORD_SYNTHETIC_SOURCE_WRITE_FIX, META_FETCH_FIELDS_ERROR_CODE_CHANGED, SPARSE_VECTOR_STORE_SUPPORT, + COUNTED_KEYWORD_SYNTHETIC_SOURCE_NATIVE_SUPPORT, SourceFieldMapper.SYNTHETIC_RECOVERY_SOURCE ); } diff --git a/x-pack/plugin/mapper-counted-keyword/src/main/java/org/elasticsearch/xpack/countedkeyword/CountedKeywordFieldMapper.java b/x-pack/plugin/mapper-counted-keyword/src/main/java/org/elasticsearch/xpack/countedkeyword/CountedKeywordFieldMapper.java index 3a50cc8143485..1a765ca06efbc 100644 --- a/x-pack/plugin/mapper-counted-keyword/src/main/java/org/elasticsearch/xpack/countedkeyword/CountedKeywordFieldMapper.java +++ b/x-pack/plugin/mapper-counted-keyword/src/main/java/org/elasticsearch/xpack/countedkeyword/CountedKeywordFieldMapper.java @@ -13,11 +13,13 @@ import org.apache.lucene.index.DocValues; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.SortField; import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.bytes.BytesArray; import org.elasticsearch.common.io.stream.ByteArrayStreamInput; import org.elasticsearch.common.io.stream.BytesStreamOutput; import org.elasticsearch.common.util.BigArrays; @@ -35,6 +37,7 @@ import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.Mapper; import org.elasticsearch.index.mapper.MapperBuilderContext; +import org.elasticsearch.index.mapper.SourceLoader; import org.elasticsearch.index.mapper.SourceValueFetcher; import org.elasticsearch.index.mapper.StringFieldType; import org.elasticsearch.index.mapper.TextSearchInfo; @@ -46,6 +49,7 @@ import org.elasticsearch.search.aggregations.support.CoreValuesSourceType; import org.elasticsearch.search.sort.BucketedSort; import 
org.elasticsearch.search.sort.SortOrder; +import org.elasticsearch.xcontent.XContentBuilder; import org.elasticsearch.xcontent.XContentParser; import java.io.IOException; @@ -72,7 +76,8 @@ * 2 for each key (one per document), a counted_terms aggregation on a counted_keyword field will consider * the actual count and report a count of 3 for each key.

* - *

Only regular source is supported; synthetic source won't work.

+ *

Synthetic source is supported, but uses the fallback "ignore source" infrastructure unless synthetic_source_keep is + * explicitly set to "none" in the field mapping parameters.

*/ public class CountedKeywordFieldMapper extends FieldMapper { public static final String CONTENT_TYPE = "counted_keyword"; @@ -306,6 +311,81 @@ public FieldMapper build(MapperBuilderContext context) { } } + private static class CountedKeywordFieldSyntheticSourceLoader extends SourceLoader.DocValuesBasedSyntheticFieldLoader { + private final String keywordsFieldName; + private final String countsFieldName; + private final String leafName; + + private SortedSetDocValues keywordsReader; + private BinaryDocValues countsReader; + private boolean hasValue; + + CountedKeywordFieldSyntheticSourceLoader(String keywordsFieldName, String countsFieldName, String leafName) { + this.keywordsFieldName = keywordsFieldName; + this.countsFieldName = countsFieldName; + this.leafName = leafName; + } + + @Override + public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException { + keywordsReader = leafReader.getSortedSetDocValues(keywordsFieldName); + countsReader = leafReader.getBinaryDocValues(countsFieldName); + + if (keywordsReader == null || countsReader == null) { + return null; + } + + return docId -> { + hasValue = keywordsReader.advanceExact(docId); + if (hasValue == false) { + return false; + } + + boolean countsHasValue = countsReader.advanceExact(docId); + assert countsHasValue; + + return true; + }; + } + + @Override + public boolean hasValue() { + return hasValue; + } + + @Override + public void write(XContentBuilder b) throws IOException { + if (hasValue == false) { + return; + } + + int[] counts = new BytesArray(countsReader.binaryValue()).streamInput().readVIntArray(); + boolean singleValue = counts.length == 1 && counts[0] == 1; + + if (singleValue) { + b.field(leafName); + } else { + b.startArray(leafName); + } + + for (int i = 0; i < keywordsReader.docValueCount(); i++) { + BytesRef currKeyword = keywordsReader.lookupOrd(keywordsReader.nextOrd()); + for (int j = 0; j < counts[i]; j++) { + b.utf8Value(currKeyword.bytes, 
currKeyword.offset, currKeyword.length); + } + } + + if (singleValue == false) { + b.endArray(); + } + } + + @Override + public String fieldName() { + return keywordsFieldName; + } + } + public static TypeParser PARSER = new TypeParser((n, c) -> new CountedKeywordFieldMapper.Builder(n)); private final FieldType fieldType; @@ -342,6 +422,11 @@ protected void parseCreateField(DocumentParserContext context) throws IOExceptio } else { throw new IllegalArgumentException("Encountered unexpected token [" + parser.currentToken() + "]."); } + + if (values.isEmpty()) { + return; + } + int i = 0; int[] counts = new int[values.size()]; for (Map.Entry value : values.entrySet()) { @@ -355,13 +440,18 @@ protected void parseCreateField(DocumentParserContext context) throws IOExceptio private void parseArray(DocumentParserContext context, SortedMap values) throws IOException { XContentParser parser = context.parser(); + int arrDepth = 1; while (true) { XContentParser.Token token = parser.nextToken(); if (token == XContentParser.Token.END_ARRAY) { - return; - } - if (token == XContentParser.Token.VALUE_STRING) { + arrDepth -= 1; + if (arrDepth <= 0) { + return; + } + } else if (token == XContentParser.Token.VALUE_STRING) { parseValue(parser, values); + } else if (token == XContentParser.Token.START_ARRAY) { + arrDepth += 1; } else if (token == XContentParser.Token.VALUE_NULL) { // ignore null values } else { @@ -399,4 +489,16 @@ public FieldMapper.Builder getMergeBuilder() { protected String contentType() { return CONTENT_TYPE; } + + @Override + protected SyntheticSourceSupport syntheticSourceSupport() { + var keepMode = sourceKeepMode(); + if (keepMode.isPresent() == false || keepMode.get() != SourceKeepMode.NONE) { + return super.syntheticSourceSupport(); + } + + var loader = new CountedKeywordFieldSyntheticSourceLoader(fullPath(), countFieldMapper.fullPath(), leafName()); + return new SyntheticSourceSupport.Native(loader); + } + } diff --git 
a/x-pack/plugin/mapper-counted-keyword/src/test/java/org/elasticsearch/xpack/countedkeyword/CountedKeywordFieldMapperTests.java b/x-pack/plugin/mapper-counted-keyword/src/test/java/org/elasticsearch/xpack/countedkeyword/CountedKeywordFieldMapperTests.java index 2ffd4468c814a..c99edcf7352fa 100644 --- a/x-pack/plugin/mapper-counted-keyword/src/test/java/org/elasticsearch/xpack/countedkeyword/CountedKeywordFieldMapperTests.java +++ b/x-pack/plugin/mapper-counted-keyword/src/test/java/org/elasticsearch/xpack/countedkeyword/CountedKeywordFieldMapperTests.java @@ -10,11 +10,15 @@ import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexableField; +import org.elasticsearch.core.CheckedConsumer; +import org.elasticsearch.core.Tuple; import org.elasticsearch.index.mapper.DocumentMapper; import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.MapperTestCase; import org.elasticsearch.index.mapper.ParsedDocument; import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.search.lookup.SourceFilter; +import org.elasticsearch.test.ESTestCase; import org.elasticsearch.xcontent.XContentBuilder; import org.junit.AssumptionViolatedException; @@ -22,6 +26,9 @@ import java.util.Collection; import java.util.Collections; import java.util.List; +import java.util.stream.Stream; + +import static org.hamcrest.Matchers.equalTo; public class CountedKeywordFieldMapperTests extends MapperTestCase { @Override @@ -64,9 +71,103 @@ protected Object generateRandomInputValue(MappedFieldType ft) { return randomBoolean() ? 
null : randomAlphaOfLengthBetween(1, 10); } + public void testSyntheticSourceSingleNullValue() throws IOException { + DocumentMapper mapper = createSytheticSourceMapperService(mapping(b -> { + b.startObject("field"); + minimalMapping(b); + b.field("synthetic_source_keep", "none"); + b.endObject(); + })).documentMapper(); + + String expected = "{}"; + CheckedConsumer buildInput = b -> { + b.field("field"); + b.nullValue(); + }; + + assertThat(syntheticSource(mapper, buildInput), equalTo(expected)); + assertThat(syntheticSource(mapper, new SourceFilter(new String[] { "field" }, null), buildInput), equalTo(expected)); + assertThat(syntheticSource(mapper, new SourceFilter(null, new String[] { "field" }), buildInput), equalTo("{}")); + } + + public void testSyntheticSourceManyNullValue() throws IOException { + DocumentMapper mapper = createSytheticSourceMapperService(mapping(b -> { + b.startObject("field"); + minimalMapping(b); + b.field("synthetic_source_keep", "none"); + b.endObject(); + })).documentMapper(); + + int nullCount = randomIntBetween(1, 5); + + String expected = "{}"; + CheckedConsumer buildInput = b -> { + b.startArray("field"); + for (int i = 0; i < nullCount; i++) { + b.nullValue(); + } + b.endArray(); + }; + + assertThat(syntheticSource(mapper, buildInput), equalTo(expected)); + assertThat(syntheticSource(mapper, new SourceFilter(new String[] { "field" }, null), buildInput), equalTo(expected)); + assertThat(syntheticSource(mapper, new SourceFilter(null, new String[] { "field" }), buildInput), equalTo("{}")); + } + + @Override + public void testSyntheticSourceKeepAll() throws IOException { + // For now, native synthetic source is only supported when "synthetic_source_keep" mapping attribute is "none" + } + + @Override + public void testSyntheticSourceKeepArrays() throws IOException { + // For now, native synthetic source is only supported when "synthetic_source_keep" mapping attribute is "none" + } + + @Override + public void 
testSyntheticSourceKeepNone() throws IOException { + // For now, native synthetic source is only supported when "synthetic_source_keep" mapping attribute is "none" + } + @Override protected SyntheticSourceSupport syntheticSourceSupport(boolean ignoreMalformed) { - throw new AssumptionViolatedException("not supported"); + return new SyntheticSourceSupport() { + @Override + public SyntheticSourceExample example(int maxValues) throws IOException { + if (randomBoolean()) { + Tuple v = generateValue(); + return new SyntheticSourceExample(v.v1(), v.v2(), this::mapping); + } + int maxNullValues = 5; + List> values = randomList(1, maxValues, this::generateValue); + List in = Stream.concat(values.stream().map(Tuple::v1), randomList(0, maxNullValues, () -> (String) null).stream()) + .toList(); + + in = shuffledList(in); + + List outList = values.stream().map(Tuple::v2).sorted().toList(); + + Object out = outList.size() == 1 ? outList.get(0) : outList; + return new SyntheticSourceExample(in, out, this::mapping); + } + + private Tuple generateValue() { + String v = ESTestCase.randomAlphaOfLength(5); + return Tuple.tuple(v, v); + } + + private void mapping(XContentBuilder b) throws IOException { + minimalMapping(b); + // For now, synthetic source is only supported when "synthetic_source_keep" is "none". + // Once we implement true synthetic source support, we should remove this. 
+ b.field("synthetic_source_keep", "none"); + } + + @Override + public List invalidExample() throws IOException { + return List.of(); + } + }; } @Override diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/counted_keyword/30_synthetic_source.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/counted_keyword/30_synthetic_source.yml new file mode 100644 index 0000000000000..7ade369893f4b --- /dev/null +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/counted_keyword/30_synthetic_source.yml @@ -0,0 +1,123 @@ +setup: + - requires: + cluster_features: ["mapper.counted_keyword.synthetic_source_native_support"] + reason: "Feature implemented" + + - do: + indices.create: + index: test-events + body: + settings: + index: + mapping.source.mode: synthetic + mappings: + properties: + events: + type: counted_keyword + synthetic_source_keep: none + + + - do: + index: + index: test-events + id: "1" + body: { "events": [ "a", "b", "a", "c" ] } + + - do: + index: + index: test-events + id: "2" + body: { "events": ["b", "b", "c", "a", "b"] } + + - do: + index: + index: test-events + id: "3" + body: { "events": ["c", "a", null, "b", null, "c"]} + + - do: + index: + index: test-events + id: "4" + body: { "events": ["a"]} + + - do: + index: + index: test-events + id: "5" + body: { "events": []} + + - do: + index: + index: test-events + id: "6" + body: { "events": [null, null]} + + - do: + indices.refresh: { } + +--- +"Source values are mutated as expected": + - do: + search: + index: test-events + body: + query: + ids: + values: [1] + - match: + hits.hits.0._source: + events: ["a", "a", "b", "c"] + + - do: + search: + index: test-events + body: + query: + ids: + values: [2] + - match: + hits.hits.0._source: + events: ["a", "b", "b", "b", "c"] + + - do: + search: + index: test-events + body: + query: + ids: + values: [3] + - match: + hits.hits.0._source: + events: ["a", "b", "c", "c"] + + - do: + search: + index: test-events + body: + 
query: + ids: + values: [4] + - match: + hits.hits.0._source: + events: "a" + + - do: + search: + index: test-events + body: + query: + ids: + values: [5] + - match: + hits.hits.0._source: {} + + - do: + search: + index: test-events + body: + query: + ids: + values: [6] + - match: + hits.hits.0._source: {}