Skip to content

Commit

Permalink
Initial native synthetic source for counted_keyword fields (elastic#1…
Browse files Browse the repository at this point in the history
…20078)

Natively support synthetic source for the counted_keyword field type
if the "synthetic_source_keep" mapping attribute is "none".

Right now we don't have the logic set up to get the correct value of
synthetic_source_keep if the value is inherited.

Until we get that set up, we can only confidently use the doc_values
implementation of synthetic_source if the synthetic_source_keep is
explicitly set to "none" in the mapping parameters.
  • Loading branch information
jordan-powers authored Jan 17, 2025
1 parent 06e1621 commit aa6822e
Show file tree
Hide file tree
Showing 4 changed files with 336 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ public Set<NodeFeature> getFeatures() {
"mapper.constant_keyword.synthetic_source_write_fix"
);

public static final NodeFeature COUNTED_KEYWORD_SYNTHETIC_SOURCE_NATIVE_SUPPORT = new NodeFeature(
"mapper.counted_keyword.synthetic_source_native_support"
);

public static final NodeFeature META_FETCH_FIELDS_ERROR_CODE_CHANGED = new NodeFeature("meta_fetch_fields_error_code_changed");
public static final NodeFeature SPARSE_VECTOR_STORE_SUPPORT = new NodeFeature("mapper.sparse_vector.store_support");

Expand All @@ -49,6 +53,7 @@ public Set<NodeFeature> getTestFeatures() {
CONSTANT_KEYWORD_SYNTHETIC_SOURCE_WRITE_FIX,
META_FETCH_FIELDS_ERROR_CODE_CHANGED,
SPARSE_VECTOR_STORE_SUPPORT,
COUNTED_KEYWORD_SYNTHETIC_SOURCE_NATIVE_SUPPORT,
SourceFieldMapper.SYNTHETIC_RECOVERY_SOURCE
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,13 @@
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.util.BigArrays;
Expand All @@ -35,6 +37,7 @@
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.Mapper;
import org.elasticsearch.index.mapper.MapperBuilderContext;
import org.elasticsearch.index.mapper.SourceLoader;
import org.elasticsearch.index.mapper.SourceValueFetcher;
import org.elasticsearch.index.mapper.StringFieldType;
import org.elasticsearch.index.mapper.TextSearchInfo;
Expand All @@ -46,6 +49,7 @@
import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
import org.elasticsearch.search.sort.BucketedSort;
import org.elasticsearch.search.sort.SortOrder;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentParser;

import java.io.IOException;
Expand All @@ -72,7 +76,8 @@
* 2 for each key (one per document), a <code>counted_terms</code> aggregation on a <code>counted_keyword</code> field will consider
* the actual count and report a count of 3 for each key.</p>
*
* <p>Only regular source is supported; synthetic source won't work.</p>
* <p>Synthetic source is supported, but uses the fallback "ignore source" infrastructure unless the <code>source_keep_mode</code> is
* explicitly set to <code>none</code> in the field mapping parameters.</p>
*/
public class CountedKeywordFieldMapper extends FieldMapper {
public static final String CONTENT_TYPE = "counted_keyword";
Expand Down Expand Up @@ -306,6 +311,81 @@ public FieldMapper build(MapperBuilderContext context) {
}
}

private static class CountedKeywordFieldSyntheticSourceLoader extends SourceLoader.DocValuesBasedSyntheticFieldLoader {
private final String keywordsFieldName;
private final String countsFieldName;
private final String leafName;

private SortedSetDocValues keywordsReader;
private BinaryDocValues countsReader;
private boolean hasValue;

CountedKeywordFieldSyntheticSourceLoader(String keywordsFieldName, String countsFieldName, String leafName) {
this.keywordsFieldName = keywordsFieldName;
this.countsFieldName = countsFieldName;
this.leafName = leafName;
}

@Override
public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException {
keywordsReader = leafReader.getSortedSetDocValues(keywordsFieldName);
countsReader = leafReader.getBinaryDocValues(countsFieldName);

if (keywordsReader == null || countsReader == null) {
return null;
}

return docId -> {
hasValue = keywordsReader.advanceExact(docId);
if (hasValue == false) {
return false;
}

boolean countsHasValue = countsReader.advanceExact(docId);
assert countsHasValue;

return true;
};
}

@Override
public boolean hasValue() {
return hasValue;
}

@Override
public void write(XContentBuilder b) throws IOException {
if (hasValue == false) {
return;
}

int[] counts = new BytesArray(countsReader.binaryValue()).streamInput().readVIntArray();
boolean singleValue = counts.length == 1 && counts[0] == 1;

if (singleValue) {
b.field(leafName);
} else {
b.startArray(leafName);
}

for (int i = 0; i < keywordsReader.docValueCount(); i++) {
BytesRef currKeyword = keywordsReader.lookupOrd(keywordsReader.nextOrd());
for (int j = 0; j < counts[i]; j++) {
b.utf8Value(currKeyword.bytes, currKeyword.offset, currKeyword.length);
}
}

if (singleValue == false) {
b.endArray();
}
}

@Override
public String fieldName() {
return keywordsFieldName;
}
}

public static TypeParser PARSER = new TypeParser((n, c) -> new CountedKeywordFieldMapper.Builder(n));

private final FieldType fieldType;
Expand Down Expand Up @@ -342,6 +422,11 @@ protected void parseCreateField(DocumentParserContext context) throws IOExceptio
} else {
throw new IllegalArgumentException("Encountered unexpected token [" + parser.currentToken() + "].");
}

if (values.isEmpty()) {
return;
}

int i = 0;
int[] counts = new int[values.size()];
for (Map.Entry<String, Integer> value : values.entrySet()) {
Expand All @@ -355,13 +440,18 @@ protected void parseCreateField(DocumentParserContext context) throws IOExceptio

private void parseArray(DocumentParserContext context, SortedMap<String, Integer> values) throws IOException {
XContentParser parser = context.parser();
int arrDepth = 1;
while (true) {
XContentParser.Token token = parser.nextToken();
if (token == XContentParser.Token.END_ARRAY) {
return;
}
if (token == XContentParser.Token.VALUE_STRING) {
arrDepth -= 1;
if (arrDepth <= 0) {
return;
}
} else if (token == XContentParser.Token.VALUE_STRING) {
parseValue(parser, values);
} else if (token == XContentParser.Token.START_ARRAY) {
arrDepth += 1;
} else if (token == XContentParser.Token.VALUE_NULL) {
// ignore null values
} else {
Expand Down Expand Up @@ -399,4 +489,16 @@ public FieldMapper.Builder getMergeBuilder() {
protected String contentType() {
return CONTENT_TYPE;
}

@Override
protected SyntheticSourceSupport syntheticSourceSupport() {
var keepMode = sourceKeepMode();
if (keepMode.isPresent() == false || keepMode.get() != SourceKeepMode.NONE) {
return super.syntheticSourceSupport();
}

var loader = new CountedKeywordFieldSyntheticSourceLoader(fullPath(), countFieldMapper.fullPath(), leafName());
return new SyntheticSourceSupport.Native(loader);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,25 @@
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.elasticsearch.core.CheckedConsumer;
import org.elasticsearch.core.Tuple;
import org.elasticsearch.index.mapper.DocumentMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.MapperTestCase;
import org.elasticsearch.index.mapper.ParsedDocument;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.search.lookup.SourceFilter;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.xcontent.XContentBuilder;
import org.junit.AssumptionViolatedException;

import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.stream.Stream;

import static org.hamcrest.Matchers.equalTo;

public class CountedKeywordFieldMapperTests extends MapperTestCase {
@Override
Expand Down Expand Up @@ -64,9 +71,103 @@ protected Object generateRandomInputValue(MappedFieldType ft) {
return randomBoolean() ? null : randomAlphaOfLengthBetween(1, 10);
}

public void testSyntheticSourceSingleNullValue() throws IOException {
DocumentMapper mapper = createSytheticSourceMapperService(mapping(b -> {
b.startObject("field");
minimalMapping(b);
b.field("synthetic_source_keep", "none");
b.endObject();
})).documentMapper();

String expected = "{}";
CheckedConsumer<XContentBuilder, IOException> buildInput = b -> {
b.field("field");
b.nullValue();
};

assertThat(syntheticSource(mapper, buildInput), equalTo(expected));
assertThat(syntheticSource(mapper, new SourceFilter(new String[] { "field" }, null), buildInput), equalTo(expected));
assertThat(syntheticSource(mapper, new SourceFilter(null, new String[] { "field" }), buildInput), equalTo("{}"));
}

public void testSyntheticSourceManyNullValue() throws IOException {
DocumentMapper mapper = createSytheticSourceMapperService(mapping(b -> {
b.startObject("field");
minimalMapping(b);
b.field("synthetic_source_keep", "none");
b.endObject();
})).documentMapper();

int nullCount = randomIntBetween(1, 5);

String expected = "{}";
CheckedConsumer<XContentBuilder, IOException> buildInput = b -> {
b.startArray("field");
for (int i = 0; i < nullCount; i++) {
b.nullValue();
}
b.endArray();
};

assertThat(syntheticSource(mapper, buildInput), equalTo(expected));
assertThat(syntheticSource(mapper, new SourceFilter(new String[] { "field" }, null), buildInput), equalTo(expected));
assertThat(syntheticSource(mapper, new SourceFilter(null, new String[] { "field" }), buildInput), equalTo("{}"));
}

@Override
public void testSyntheticSourceKeepAll() throws IOException {
// For now, native synthetic source is only supported when "synthetic_source_keep" mapping attribute is "none"
}

@Override
public void testSyntheticSourceKeepArrays() throws IOException {
// For now, native synthetic source is only supported when "synthetic_source_keep" mapping attribute is "none"
}

@Override
public void testSyntheticSourceKeepNone() throws IOException {
// For now, native synthetic source is only supported when "synthetic_source_keep" mapping attribute is "none"
}

@Override
protected SyntheticSourceSupport syntheticSourceSupport(boolean ignoreMalformed) {
throw new AssumptionViolatedException("not supported");
return new SyntheticSourceSupport() {
@Override
public SyntheticSourceExample example(int maxValues) throws IOException {
if (randomBoolean()) {
Tuple<String, String> v = generateValue();
return new SyntheticSourceExample(v.v1(), v.v2(), this::mapping);
}
int maxNullValues = 5;
List<Tuple<String, String>> values = randomList(1, maxValues, this::generateValue);
List<String> in = Stream.concat(values.stream().map(Tuple::v1), randomList(0, maxNullValues, () -> (String) null).stream())
.toList();

in = shuffledList(in);

List<String> outList = values.stream().map(Tuple::v2).sorted().toList();

Object out = outList.size() == 1 ? outList.get(0) : outList;
return new SyntheticSourceExample(in, out, this::mapping);
}

private Tuple<String, String> generateValue() {
String v = ESTestCase.randomAlphaOfLength(5);
return Tuple.tuple(v, v);
}

private void mapping(XContentBuilder b) throws IOException {
minimalMapping(b);
// For now, synthetic source is only supported when "synthetic_source_keep" is "none".
// Once we implement true synthetic source support, we should remove this.
b.field("synthetic_source_keep", "none");
}

@Override
public List<SyntheticSourceInvalidExample> invalidExample() throws IOException {
return List.of();
}
};
}

@Override
Expand Down
Loading

0 comments on commit aa6822e

Please sign in to comment.