Synthetic source doc values arrays encoding experiment
The keyword doc values field gets an extra binary doc values field that encodes the order in which array values were specified at index time. This also captures duplicate values.

The ordering is stored as an offset-to-ordinal array that is vint-encoded into the binary doc values field. The additional storage this requires will likely be minimized by elastic#112416 (zstd compression for binary doc values).
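As a small illustration of that encoding, here is a sketch that round-trips an offset array through the same `BytesStreamOutput.writeVIntArray` / `StreamInput.readVIntArray` APIs this commit uses (the wrapper class and method are hypothetical):

```java
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.io.stream.StreamInput;

import java.io.IOException;

class VIntRoundTripSketch {
    // Writes the offset array as vints (length first, then each offset) and reads it back.
    static int[] roundTrip(int[] offsetToOrd) throws IOException {
        try (BytesStreamOutput out = new BytesStreamOutput()) {
            out.writeVIntArray(offsetToOrd);
            try (StreamInput in = out.bytes().streamInput()) {
                return in.readVIntArray(); // e.g. [2, 1, 0, 2] comes back unchanged
            }
        }
    }
}
```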

For example, take the string array ["c", "b", "a", "c"] for a keyword field. The sorted set doc values are ["a", "b", "c"] with ordinals 0, 1 and 2, so the offset array becomes [2, 1, 0, 2].
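A minimal sketch of how such an offset array can be derived from the raw array values (a hypothetical helper for illustration; the commit itself records offsets incrementally while parsing the document):

```java
import java.util.Arrays;
import java.util.List;
import java.util.TreeMap;

class OffsetsSketch {
    // Map each original array position to the ordinal of its value in the sorted set.
    static int[] offsetToOrdinals(List<String> arrayValues) {
        // Sorted set doc values store each distinct value once, in sorted order.
        TreeMap<String, Integer> ordinals = new TreeMap<>();
        for (String value : arrayValues) {
            ordinals.putIfAbsent(value, 0);
        }
        int ord = 0;
        for (var entry : ordinals.entrySet()) {
            entry.setValue(ord++); // ordinals 0..n-1 in sorted order
        }
        int[] offsets = new int[arrayValues.size()];
        for (int i = 0; i < arrayValues.size(); i++) {
            offsets[i] = ordinals.get(arrayValues.get(i));
        }
        return offsets;
    }

    public static void main(String[] args) {
        // ["c", "b", "a", "c"] -> [2, 1, 0, 2]
        System.out.println(Arrays.toString(offsetToOrdinals(List.of("c", "b", "a", "c"))));
    }
}
```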

Limitations:
* Only the keyword field mapper is supported.
* Multi-level leaf arrays are flattened. For example: [[b], [c]] -> [b, c]
* Empty arrays ([]) are not recorded.
* Arrays are always synthesized as one type. In the case of a keyword field, [1, 2] gets synthesized as ["1", "2"].

These limitations can be addressed, but some require more complexity and/or additional storage.
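On the read side, the original leaf array can then be rebuilt from the sorted distinct values plus the offset array. A minimal sketch under the same assumptions (hypothetical helper; the commit instead plugs the offsets field into SortedSetDocValuesSyntheticFieldLoaderLayer):

```java
import java.util.ArrayList;
import java.util.List;

class ReconstructSketch {
    // Rebuild the original leaf array, including duplicates and ordering,
    // from the sorted distinct values and the offset-to-ordinal array.
    static List<String> reconstruct(List<String> sortedValues, int[] offsetToOrd) {
        List<String> original = new ArrayList<>(offsetToOrd.length);
        for (int ord : offsetToOrd) {
            original.add(sortedValues.get(ord));
        }
        return original;
    }

    public static void main(String[] args) {
        // ["a", "b", "c"] + [2, 1, 0, 2] -> ["c", "b", "a", "c"]
        System.out.println(reconstruct(List.of("a", "b", "c"), new int[] {2, 1, 0, 2}));
    }
}
```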
martijnvg committed Sep 29, 2024
1 parent 0cdcc8c commit 1499dbd
Showing 9 changed files with 224 additions and 27 deletions.
@@ -965,7 +965,7 @@ subobjects auto:
- match: { hits.hits.0._source.foo: 10 }
- match: { hits.hits.0._source.foo\.bar: 100 }
- match: { hits.hits.0._source.regular\.span\.id: "1" }
- match: { hits.hits.0._source.regular\.trace\.id: [ "a", "b" ] }
- match: { hits.hits.0._source.regular\.trace\.id: [ "b", "a", "b" ] }
- match: { hits.hits.1._source.id: 2 }
- match: { hits.hits.1._source.foo: 20 }
- match: { hits.hits.1._source.foo\.bar: 200 }
@@ -447,7 +447,7 @@ field param - keep root array:
index: test
sort: id

- match: { hits.hits.0._source.kw_default: [ "A", "B" ] }
- match: { hits.hits.0._source.kw_default: [ "B", "A" ] }
- match: { hits.hits.0._source.kw_arrays: [ "B", "A" ] }
- match: { hits.hits.0._source.kw_all: [ "B", "A" ] }

@@ -513,7 +513,7 @@ field param - keep nested array:
sort: id

- match: { hits.hits.0._source.path.to.ratio: 10.0 }
- match: { hits.hits.0._source.path.to.kw_default: [ "A", "B" ] }
- match: { hits.hits.0._source.path.to.kw_default: [ "B", "A" ] }
- match: { hits.hits.0._source.path.to.kw_arrays: [ "B", "A" ] }
- match: { hits.hits.0._source.path.to.kw_all: [ "B", "A" ] }

@@ -807,7 +807,7 @@ private static void parseNonDynamicArray(
String fullPath = context.path().pathAsText(arrayFieldName);

// Check if we need to record the array source. This only applies to synthetic source.
if (context.canAddIgnoredField()) {
if (context.canAddIgnoredField() && (mapper instanceof KeywordFieldMapper == false /* hard coded exclusion for the keyword mapper */)) {
boolean objectRequiresStoringSource = mapper instanceof ObjectMapper objectMapper
&& (objectMapper.storeArraySource()
|| (context.sourceKeepModeFromIndexSettings() == Mapper.SourceKeepMode.ARRAYS
@@ -834,10 +834,13 @@ private static void parseNonDynamicArray(
}
}

// In synthetic source, if any array element requires storing its source as-is, it takes precedence over
// elements from regular source loading that are then skipped from the synthesized array source.
// To prevent this, we track each array name, to check if it contains any sub-arrays in its elements.
context = context.cloneForArray(fullPath);
// hard coded exclusion for the keyword mapper:
if (mapper instanceof KeywordFieldMapper == false) {
// In synthetic source, if any array element requires storing its source as-is, it takes precedence over
// elements from regular source loading that are then skipped from the synthesized array source.
// To prevent this, we track each array name, to check if it contains any sub-arrays in its elements.
context = context.cloneForArray(fullPath);
}

XContentParser parser = context.parser();
XContentParser.Token token;
@@ -855,6 +858,10 @@ private static void parseNonDynamicArray(
parseValue(context, lastFieldName);
}
}
// hard coded post-processing of arrays:
if (mapper instanceof KeywordFieldMapper k) {
k.processOffsets(context);
}
postProcessDynamicArrayMapping(context, lastFieldName);
}

@@ -27,11 +27,14 @@
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;

/**
* Context used when parsing incoming documents. Holds everything that is needed to parse a document as well as
@@ -918,4 +921,19 @@ public String currentName() throws IOException {
return field;
}
}

// Per field: value -> the 0-based array positions at which that value occurred, values kept in natural order.
private final Map<String, SortedMap<String, List<Integer>>> arrayOffsetsByField = new HashMap<>();
private final Map<String, Integer> numValuesByField = new HashMap<>();

public SortedMap<String, List<Integer>> getOffsets(String field) {
return arrayOffsetsByField.computeIfAbsent(field, s -> new TreeMap<>(Comparator.naturalOrder()));
}

public void recordOffset(String fieldName, String value) {
// 0-based running count of values seen for this field in the current document
int count = numValuesByField.compute(fieldName, (s, integer) -> integer == null ? 0 : ++integer);
var values = arrayOffsetsByField.computeIfAbsent(fieldName, s -> new TreeMap<>(Comparator.naturalOrder()));
var offsets = values.computeIfAbsent(value, s -> new ArrayList<>());
offsets.add(count);
}

}
@@ -13,6 +13,7 @@
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.InvertableType;
@@ -34,6 +35,7 @@
import org.apache.lucene.util.automaton.CompiledAutomaton.AUTOMATON_TYPE;
import org.apache.lucene.util.automaton.MinimizationOperations;
import org.apache.lucene.util.automaton.Operations;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.lucene.search.AutomatonQueries;
@@ -65,17 +67,23 @@
import org.elasticsearch.search.runtime.StringScriptFieldTermQuery;
import org.elasticsearch.search.runtime.StringScriptFieldWildcardQuery;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentParser;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;

import static org.apache.lucene.index.IndexWriter.MAX_TERM_LENGTH;
import static org.elasticsearch.core.Strings.format;
@@ -89,6 +97,7 @@ public final class KeywordFieldMapper extends FieldMapper {
private static final Logger logger = LogManager.getLogger(KeywordFieldMapper.class);

public static final String CONTENT_TYPE = "keyword";
public static final String OFFSETS_FIELD_NAME_SUFFIX = "_offsets";

static final NodeFeature KEYWORD_DIMENSION_IGNORE_ABOVE = new NodeFeature("mapper.keyword_dimension_ignore_above");
static final NodeFeature KEYWORD_NORMALIZER_SYNTHETIC_SOURCE = new NodeFeature("mapper.keyword_normalizer_synthetic_source");
@@ -375,13 +384,26 @@ public KeywordFieldMapper build(MapperBuilderContext context) {
}
super.hasScript = script.get() != null;
super.onScriptError = onScriptError.getValue();

BinaryFieldMapper offsetsFieldMapper;
if (context.isSourceSynthetic() && fieldtype.stored() == false) {
// keep track of value offsets so that arrays can be reconstructed from doc values in the order in which
// the values were specified at index time (if the field is stored there is no point in doing this)
offsetsFieldMapper = new BinaryFieldMapper.Builder(leafName() + OFFSETS_FIELD_NAME_SUFFIX, context.isSourceSynthetic())
.docValues(true)
.build(context);
} else {
offsetsFieldMapper = null;
}

return new KeywordFieldMapper(
leafName(),
fieldtype,
buildFieldType(context, fieldtype),
builderParams(this, context),
context.isSourceSynthetic(),
this
this,
offsetsFieldMapper
);
}
}
@@ -882,14 +904,16 @@ public boolean hasNormalizer() {
private final IndexAnalyzers indexAnalyzers;
private final int ignoreAboveDefault;
private final int ignoreAbove;
private final BinaryFieldMapper offsetsFieldMapper;

private KeywordFieldMapper(
String simpleName,
FieldType fieldType,
KeywordFieldType mappedFieldType,
BuilderParams builderParams,
boolean isSyntheticSource,
Builder builder
Builder builder,
BinaryFieldMapper offsetsFieldMapper
) {
super(simpleName, mappedFieldType, builderParams);
assert fieldType.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) <= 0;
@@ -906,17 +930,57 @@ private KeywordFieldMapper(
this.isSyntheticSource = isSyntheticSource;
this.ignoreAboveDefault = builder.ignoreAboveDefault;
this.ignoreAbove = builder.ignoreAbove.getValue();
this.offsetsFieldMapper = offsetsFieldMapper;
}

@Override
public KeywordFieldType fieldType() {
return (KeywordFieldType) super.fieldType();
}

@Override
protected void parseCreateField(DocumentParserContext context) throws IOException {
final String value = context.parser().textOrNull();
indexValue(context, value == null ? fieldType().nullValue : value);
String value = context.parser().textOrNull();
if (value == null) {
value = fieldType().nullValue;
}
boolean indexed = indexValue(context, value);
if (offsetsFieldMapper != null && indexed) {
context.recordOffset(fullPath(), value);
}
}

public void processOffsets(DocumentParserContext context) throws IOException {
var values = context.getOffsets(fullPath());
if (values.isEmpty()) {
return;
}

// offsets are 0-based, so the length of the original leaf array is the max offset + 1
int arrayLength = 0;
for (List<Integer> offsets : values.values()) {
for (Integer offset : offsets) {
arrayLength = Math.max(arrayLength, offset);
}
}
arrayLength += 1;

// values iterates in natural (sorted) order, so ord lines up with the sorted set doc values ordinal
int ord = 0;
int[] offsetToOrd = new int[arrayLength];
for (var entry : values.entrySet()) {
for (var offset : entry.getValue()) {
offsetToOrd[offset] = ord;
}
ord++;
}

logger.info("values=" + values);
logger.info("offsetToOrd=" + Arrays.toString(offsetToOrd));

try (var streamOutput = new BytesStreamOutput()) {
// TODO: optimize
// This array makes it possible to retain the original ordering of the leaf array, including duplicate values.
streamOutput.writeVIntArray(offsetToOrd);
context.doc().add(new BinaryDocValuesField(offsetsFieldMapper.fullPath(), streamOutput.bytes().toBytesRef()));
}
}

@Override
@@ -929,13 +993,13 @@ protected void indexScriptValues(
this.fieldType().scriptValues.valuesForDoc(searchLookup, readerContext, doc, value -> indexValue(documentParserContext, value));
}

private void indexValue(DocumentParserContext context, String value) {
private boolean indexValue(DocumentParserContext context, String value) {
if (value == null) {
return;
return false;
}
// if field is disabled, skip indexing
if ((fieldType.indexOptions() == IndexOptions.NONE) && (fieldType.stored() == false) && (fieldType().hasDocValues() == false)) {
return;
return false;
}

if (value.length() > fieldType().ignoreAbove()) {
@@ -944,7 +1008,7 @@ private void indexValue(DocumentParserContext context, String value) {
// Save a copy of the field so synthetic source can load it
context.doc().add(new StoredField(originalName(), new BytesRef(value)));
}
return;
return false;
}

value = normalizeValue(fieldType().normalizer(), fullPath(), value);
@@ -982,6 +1046,8 @@ private void indexValue(DocumentParserContext context, String value) {
if (fieldType().hasDocValues() == false && fieldType.omitNorms()) {
context.addToFieldNames(fieldType().name());
}

return true;
}

private static String normalizeValue(NamedAnalyzer normalizer, String field, String value) {
@@ -1078,7 +1144,8 @@ protected void writeValue(Object value, XContentBuilder b) throws IOException {
}
});
} else if (hasDocValues) {
layers.add(new SortedSetDocValuesSyntheticFieldLoaderLayer(fullPath()) {
String offsetsFullPath = offsetsFieldMapper != null ? offsetsFieldMapper.fullPath() : null;
layers.add(new SortedSetDocValuesSyntheticFieldLoaderLayer(fullPath(), offsetsFullPath) {

@Override
protected BytesRef convert(BytesRef value) {