Synthetic source doc values arrays encoding experiment
The keyword doc values field gets an extra binary doc values field that encodes the order in which array values were specified at index time. This also captures duplicate values.

The ordering is stored as an offset-to-ordinal array that is vint-encoded into the binary doc values field. The additional storage this requires will likely be minimized by elastic#112416 (zstd compression for binary doc values).
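As a small illustration of that encoding, here is a sketch that round-trips an offset array through the same `BytesStreamOutput.writeVIntArray` / `StreamInput.readVIntArray` APIs this commit uses (the wrapper class and method are hypothetical):

```java
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.io.stream.StreamInput;

import java.io.IOException;

class VIntRoundTripSketch {
    // Writes the offset array as vints (length first, then each offset) and reads it back.
    static int[] roundTrip(int[] offsetToOrd) throws IOException {
        try (BytesStreamOutput out = new BytesStreamOutput()) {
            out.writeVIntArray(offsetToOrd);
            try (StreamInput in = out.bytes().streamInput()) {
                return in.readVIntArray(); // e.g. [2, 1, 0, 2] comes back unchanged
            }
        }
    }
}
```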

For example, take the string array ["c", "b", "a", "c"] for a keyword field. The sorted set doc values are ["a", "b", "c"] with ordinals 0, 1 and 2, so the offset array becomes [2, 1, 0, 2].
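A minimal sketch of how such an offset array can be derived from the raw array values (a hypothetical helper for illustration; the commit itself records offsets incrementally while parsing the document):

```java
import java.util.Arrays;
import java.util.List;
import java.util.TreeMap;

class OffsetsSketch {
    // Map each original array position to the ordinal of its value in the sorted set.
    static int[] offsetToOrdinals(List<String> arrayValues) {
        // Sorted set doc values store each distinct value once, in sorted order.
        TreeMap<String, Integer> ordinals = new TreeMap<>();
        for (String value : arrayValues) {
            ordinals.putIfAbsent(value, 0);
        }
        int ord = 0;
        for (var entry : ordinals.entrySet()) {
            entry.setValue(ord++); // ordinals 0..n-1 in sorted order
        }
        int[] offsets = new int[arrayValues.size()];
        for (int i = 0; i < arrayValues.size(); i++) {
            offsets[i] = ordinals.get(arrayValues.get(i));
        }
        return offsets;
    }

    public static void main(String[] args) {
        // ["c", "b", "a", "c"] -> [2, 1, 0, 2]
        System.out.println(Arrays.toString(offsetToOrdinals(List.of("c", "b", "a", "c"))));
    }
}
```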

Limitations:
* Only the keyword field mapper is supported.
* Multi-level leaf arrays are flattened. For example: [[b], [c]] -> [b, c]
* Empty arrays ([]) are not recorded.
* Arrays are always synthesized as one type. In the case of a keyword field, [1, 2] gets synthesized as ["1", "2"].

These limitations can be addressed, but some require more complexity and/or additional storage.
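On the read side, the original leaf array can then be rebuilt from the sorted distinct values plus the offset array. A minimal sketch under the same assumptions (hypothetical helper; the commit instead plugs the offsets field into SortedSetDocValuesSyntheticFieldLoaderLayer):

```java
import java.util.ArrayList;
import java.util.List;

class ReconstructSketch {
    // Rebuild the original leaf array, including duplicates and ordering,
    // from the sorted distinct values and the offset-to-ordinal array.
    static List<String> reconstruct(List<String> sortedValues, int[] offsetToOrd) {
        List<String> original = new ArrayList<>(offsetToOrd.length);
        for (int ord : offsetToOrd) {
            original.add(sortedValues.get(ord));
        }
        return original;
    }

    public static void main(String[] args) {
        // ["a", "b", "c"] + [2, 1, 0, 2] -> ["c", "b", "a", "c"]
        System.out.println(reconstruct(List.of("a", "b", "c"), new int[] {2, 1, 0, 2}));
    }
}
```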
martijnvg committed Sep 29, 2024
1 parent 0cdcc8c commit 1499dbd
Showing 9 changed files with 224 additions and 27 deletions.
@@ -965,7 +965,7 @@ subobjects auto:
- match: { hits.hits.0._source.foo: 10 }
- match: { hits.hits.0._source.foo\.bar: 100 }
- match: { hits.hits.0._source.regular\.span\.id: "1" }
- match: { hits.hits.0._source.regular\.trace\.id: [ "a", "b" ] }
- match: { hits.hits.0._source.regular\.trace\.id: [ "b", "a", "b" ] }
- match: { hits.hits.1._source.id: 2 }
- match: { hits.hits.1._source.foo: 20 }
- match: { hits.hits.1._source.foo\.bar: 200 }
@@ -447,7 +447,7 @@ field param - keep root array:
index: test
sort: id

- match: { hits.hits.0._source.kw_default: [ "A", "B" ] }
- match: { hits.hits.0._source.kw_default: [ "B", "A" ] }
- match: { hits.hits.0._source.kw_arrays: [ "B", "A" ] }
- match: { hits.hits.0._source.kw_all: [ "B", "A" ] }

@@ -513,7 +513,7 @@ field param - keep nested array:
sort: id

- match: { hits.hits.0._source.path.to.ratio: 10.0 }
- match: { hits.hits.0._source.path.to.kw_default: [ "A", "B" ] }
- match: { hits.hits.0._source.path.to.kw_default: [ "B", "A" ] }
- match: { hits.hits.0._source.path.to.kw_arrays: [ "B", "A" ] }
- match: { hits.hits.0._source.path.to.kw_all: [ "B", "A" ] }

@@ -807,7 +807,7 @@ private static void parseNonDynamicArray(
String fullPath = context.path().pathAsText(arrayFieldName);

// Check if we need to record the array source. This only applies to synthetic source.
if (context.canAddIgnoredField()) {
if (context.canAddIgnoredField() && (mapper instanceof KeywordFieldMapper == false /* hard coded exclusion for the keyword mapper */)) {
boolean objectRequiresStoringSource = mapper instanceof ObjectMapper objectMapper
&& (objectMapper.storeArraySource()
|| (context.sourceKeepModeFromIndexSettings() == Mapper.SourceKeepMode.ARRAYS
@@ -834,10 +834,13 @@ private static void parseNonDynamicArray(
}
}

// In synthetic source, if any array element requires storing its source as-is, it takes precedence over
// elements from regular source loading that are then skipped from the synthesized array source.
// To prevent this, we track each array name, to check if it contains any sub-arrays in its elements.
context = context.cloneForArray(fullPath);
// hard coded exclusion for the keyword mapper:
if (mapper instanceof KeywordFieldMapper == false) {
// In synthetic source, if any array element requires storing its source as-is, it takes precedence over
// elements from regular source loading that are then skipped from the synthesized array source.
// To prevent this, we track each array name, to check if it contains any sub-arrays in its elements.
context = context.cloneForArray(fullPath);
}

XContentParser parser = context.parser();
XContentParser.Token token;
@@ -855,6 +858,10 @@ private static void parseNonDynamicArray(
parseValue(context, lastFieldName);
}
}
// hard coded post-processing of arrays:
if (mapper instanceof KeywordFieldMapper k) {
k.processOffsets(context);
}
postProcessDynamicArrayMapping(context, lastFieldName);
}

@@ -27,11 +27,14 @@
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;

/**
* Context used when parsing incoming documents. Holds everything that is needed to parse a document as well as
@@ -918,4 +921,19 @@ public String currentName() throws IOException {
return field;
}
}

// Per field: value -> the 0-based array positions at which that value occurred, values kept in natural order.
private final Map<String, SortedMap<String, List<Integer>>> arrayOffsetsByField = new HashMap<>();
private final Map<String, Integer> numValuesByField = new HashMap<>();

public SortedMap<String, List<Integer>> getOffsets(String field) {
return arrayOffsetsByField.computeIfAbsent(field, s -> new TreeMap<>(Comparator.naturalOrder()));
}

public void recordOffset(String fieldName, String value) {
// 0-based running count of values seen for this field in the current document
int count = numValuesByField.compute(fieldName, (s, integer) -> integer == null ? 0 : ++integer);
var values = arrayOffsetsByField.computeIfAbsent(fieldName, s -> new TreeMap<>(Comparator.naturalOrder()));
var offsets = values.computeIfAbsent(value, s -> new ArrayList<>());
offsets.add(count);
}

}
@@ -13,6 +13,7 @@
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.InvertableType;
@@ -34,6 +35,7 @@
import org.apache.lucene.util.automaton.CompiledAutomaton.AUTOMATON_TYPE;
import org.apache.lucene.util.automaton.MinimizationOperations;
import org.apache.lucene.util.automaton.Operations;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.lucene.search.AutomatonQueries;
@@ -65,17 +67,23 @@
import org.elasticsearch.search.runtime.StringScriptFieldTermQuery;
import org.elasticsearch.search.runtime.StringScriptFieldWildcardQuery;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentParser;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;

import static org.apache.lucene.index.IndexWriter.MAX_TERM_LENGTH;
import static org.elasticsearch.core.Strings.format;
@@ -89,6 +97,7 @@ public final class KeywordFieldMapper extends FieldMapper {
private static final Logger logger = LogManager.getLogger(KeywordFieldMapper.class);

public static final String CONTENT_TYPE = "keyword";
public static final String OFFSETS_FIELD_NAME_SUFFIX = "_offsets";

static final NodeFeature KEYWORD_DIMENSION_IGNORE_ABOVE = new NodeFeature("mapper.keyword_dimension_ignore_above");
static final NodeFeature KEYWORD_NORMALIZER_SYNTHETIC_SOURCE = new NodeFeature("mapper.keyword_normalizer_synthetic_source");
@@ -375,13 +384,26 @@ public KeywordFieldMapper build(MapperBuilderContext context) {
}
super.hasScript = script.get() != null;
super.onScriptError = onScriptError.getValue();

BinaryFieldMapper offsetsFieldMapper;
if (context.isSourceSynthetic() && fieldtype.stored() == false) {
// keep track of value offsets so that arrays can be reconstructed from doc values in the order in which
// the values were specified at index time (if the field is stored there is no point in doing this)
offsetsFieldMapper = new BinaryFieldMapper.Builder(leafName() + OFFSETS_FIELD_NAME_SUFFIX, context.isSourceSynthetic())
.docValues(true)
.build(context);
} else {
offsetsFieldMapper = null;
}

return new KeywordFieldMapper(
leafName(),
fieldtype,
buildFieldType(context, fieldtype),
builderParams(this, context),
context.isSourceSynthetic(),
this
this,
offsetsFieldMapper
);
}
}
@@ -882,14 +904,16 @@ public boolean hasNormalizer() {
private final IndexAnalyzers indexAnalyzers;
private final int ignoreAboveDefault;
private final int ignoreAbove;
private final BinaryFieldMapper offsetsFieldMapper;

private KeywordFieldMapper(
String simpleName,
FieldType fieldType,
KeywordFieldType mappedFieldType,
BuilderParams builderParams,
boolean isSyntheticSource,
Builder builder
Builder builder,
BinaryFieldMapper offsetsFieldMapper
) {
super(simpleName, mappedFieldType, builderParams);
assert fieldType.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) <= 0;
@@ -906,17 +930,57 @@ private KeywordFieldMapper(
this.isSyntheticSource = isSyntheticSource;
this.ignoreAboveDefault = builder.ignoreAboveDefault;
this.ignoreAbove = builder.ignoreAbove.getValue();
this.offsetsFieldMapper = offsetsFieldMapper;
}

@Override
public KeywordFieldType fieldType() {
return (KeywordFieldType) super.fieldType();
}

@Override
protected void parseCreateField(DocumentParserContext context) throws IOException {
final String value = context.parser().textOrNull();
indexValue(context, value == null ? fieldType().nullValue : value);
String value = context.parser().textOrNull();
if (value == null) {
value = fieldType().nullValue;
}
boolean indexed = indexValue(context, value);
if (offsetsFieldMapper != null && indexed) {
context.recordOffset(fullPath(), value);
}
}

public void processOffsets(DocumentParserContext context) throws IOException {
var values = context.getOffsets(fullPath());
if (values.isEmpty()) {
return;
}

// offsets are 0-based, so the length of the original leaf array is the max offset + 1
int arrayLength = 0;
for (List<Integer> offsets : values.values()) {
for (Integer offset : offsets) {
arrayLength = Math.max(arrayLength, offset);
}
}
arrayLength += 1;

// values iterates in natural (sorted) order, so ord lines up with the sorted set doc values ordinal
int ord = 0;
int[] offsetToOrd = new int[arrayLength];
for (var entry : values.entrySet()) {
for (var offset : entry.getValue()) {
offsetToOrd[offset] = ord;
}
ord++;
}

logger.info("values=" + values);
logger.info("offsetToOrd=" + Arrays.toString(offsetToOrd));

try (var streamOutput = new BytesStreamOutput()) {
// TODO: optimize
// This array makes it possible to retain the original ordering of the leaf array, including duplicate values.
streamOutput.writeVIntArray(offsetToOrd);
context.doc().add(new BinaryDocValuesField(offsetsFieldMapper.fullPath(), streamOutput.bytes().toBytesRef()));
}
}

@Override
@@ -929,13 +993,13 @@ protected void indexScriptValues(
this.fieldType().scriptValues.valuesForDoc(searchLookup, readerContext, doc, value -> indexValue(documentParserContext, value));
}

private void indexValue(DocumentParserContext context, String value) {
private boolean indexValue(DocumentParserContext context, String value) {
if (value == null) {
return;
return false;
}
// if field is disabled, skip indexing
if ((fieldType.indexOptions() == IndexOptions.NONE) && (fieldType.stored() == false) && (fieldType().hasDocValues() == false)) {
return;
return false;
}

if (value.length() > fieldType().ignoreAbove()) {
@@ -944,7 +1008,7 @@ private void indexValue(DocumentParserContext context, String value) {
// Save a copy of the field so synthetic source can load it
context.doc().add(new StoredField(originalName(), new BytesRef(value)));
}
return;
return false;
}

value = normalizeValue(fieldType().normalizer(), fullPath(), value);
@@ -982,6 +1046,8 @@ private void indexValue(DocumentParserContext context, String value) {
if (fieldType().hasDocValues() == false && fieldType.omitNorms()) {
context.addToFieldNames(fieldType().name());
}

return true;
}

private static String normalizeValue(NamedAnalyzer normalizer, String field, String value) {
@@ -1078,7 +1144,8 @@ protected void writeValue(Object value, XContentBuilder b) throws IOException {
}
});
} else if (hasDocValues) {
layers.add(new SortedSetDocValuesSyntheticFieldLoaderLayer(fullPath()) {
String offsetsFullPath = offsetsFieldMapper != null ? offsetsFieldMapper.fullPath() : null;
layers.add(new SortedSetDocValuesSyntheticFieldLoaderLayer(fullPath(), offsetsFullPath) {

@Override
protected BytesRef convert(BytesRef value) {