Add support for combined_field query (opensearch-project#4699)

This adds support for the CombinedFieldQuery from the Lucene sandbox. The supported syntax is as follows:

```
{
  "combined_field": {
    "query": "quick brown fox",        // required
    "fields": [                        // if no fields are specified, the query matches nothing
      "a_text_field",                  // must be a text field, otherwise it is ignored
      "a_text_field_with_weight^5"
    ],
    "analyzer": "custom_analyzer",     // optional
    "zero_terms_query": "none"         // optional
  }
}
```

If no analyzer is specified, terms are derived from the union of terms from all fields' analyzers. The behavior of zero_terms_query is the same as for multi_match.

Fixes:
- opensearch-project#3996

Signed-off-by: Michael Froh <[email protected]>
msfroh · Oct 6, 2022 · f6b82e1 · f6b82e1
1 parent ed359f0
commit f6b82e1
Show file tree

Hide file tree

Showing 6 changed files with 549 additions and 1 deletion.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -29,6 +29,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 - Added release notes for 1.3.6 ([#4681](https://github.com/opensearch-project/OpenSearch/pull/4681))
 - Added precommit support for MacOS ([#4682](https://github.com/opensearch-project/OpenSearch/pull/4682))
 - Recommission API changes for service layer ([#4320](https://github.com/opensearch-project/OpenSearch/pull/4320))
+- Add support for combined field query ([#4699](https://github.com/opensearch-project/OpenSearch/pull/4699))
 ### Dependencies
 - Bumps `log4j-core` from 2.18.0 to 2.19.0
 - Bumps `reactor-netty-http` from 1.0.18 to 1.0.23

diff --git a/server/src/main/java/org/opensearch/index/query/CombinedFieldQueryBuilder.java b/server/src/main/java/org/opensearch/index/query/CombinedFieldQueryBuilder.java
@@ -0,0 +1,335 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.index.query;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.sandbox.search.CombinedFieldQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.util.BytesRef;
+import org.opensearch.common.ParseField;
+import org.opensearch.common.ParsingException;
+import org.opensearch.common.Strings;
+import org.opensearch.common.io.stream.StreamInput;
+import org.opensearch.common.io.stream.StreamOutput;
+import org.opensearch.common.lucene.search.Queries;
+import org.opensearch.common.xcontent.XContentBuilder;
+import org.opensearch.common.xcontent.XContentParser;
+import org.opensearch.index.mapper.MappedFieldType;
+import org.opensearch.index.mapper.TextSearchInfo;
+import org.opensearch.index.search.MatchQuery;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Objects;
+import java.util.TreeMap;
+
+import static org.opensearch.index.query.MultiMatchQueryBuilder.parseFieldAndBoost;
+
+/**
+ * Builder for the {@code combined_field} query: a pure disjunction of query terms across the given fields,
+ * where scoring pretends that the terms were all indexed to a single field.
+ * <p>
+ * Fields that are unmapped or that do not support text search are ignored when the Lucene query is built. If an
+ * explicit analyzer is set it is used for every field; otherwise each field's search analyzer is applied and the
+ * terms produced for every field are added to the same query.
+ *
+ * @see CombinedFieldQuery
+ *
+ * @opensearch.internal
+ */
+public class CombinedFieldQueryBuilder extends AbstractQueryBuilder<CombinedFieldQueryBuilder> {
+    public static final String NAME = "combined_field";
+
+    private static final ParseField FIELDS_FIELD = new ParseField("fields");
+    private static final ParseField QUERY_FIELD = new ParseField("query");
+    private static final ParseField ANALYZER_FIELD = new ParseField("analyzer");
+    private static final ParseField ZERO_TERMS_QUERY_FIELD = new ParseField("zero_terms_query");
+
+    /** The query text (or other value) to analyze; never {@code null}. */
+    private final Object value;
+    /** Field name to weight; a sorted map so that serialization and XContent output have a stable order. */
+    private final Map<String, Float> fieldWeights;
+    /** Name of an explicit analyzer, or {@code null} to use each field's search analyzer. */
+    private String analyzer;
+    /** What to return when analysis produces no terms at all. */
+    private MatchQuery.ZeroTermsQuery zeroTermsQuery = MatchQuery.DEFAULT_ZERO_TERMS_QUERY;
+
+    /**
+     * Creates a builder for the given query value with no fields.
+     *
+     * @param value the query value; must not be {@code null}
+     * @throws IllegalArgumentException if {@code value} is {@code null}
+     */
+    public CombinedFieldQueryBuilder(Object value) {
+        if (value == null) {
+            throw new IllegalArgumentException("[" + NAME + "] requires query value");
+        }
+        this.value = value;
+        this.fieldWeights = new TreeMap<>();
+    }
+
+    /**
+     * Reads a builder from the stream, in the same order that {@link #doWriteTo(StreamOutput)} writes it.
+     *
+     * @throws IllegalArgumentException if a deserialized weight is not a valid weight
+     */
+    public CombinedFieldQueryBuilder(StreamInput in) throws IOException {
+        super(in);
+        value = in.readGenericValue();
+        int size = in.readVInt();
+        fieldWeights = new TreeMap<>();
+        for (int i = 0; i < size; i++) {
+            String field = in.readString();
+            float weight = in.readFloat();
+            checkValidWeight(weight);
+            fieldWeights.put(field, weight);
+        }
+        analyzer = in.readOptionalString();
+        zeroTermsQuery = MatchQuery.ZeroTermsQuery.readFromStream(in);
+    }
+
+    /**
+     * Writes the value, the field/weight pairs (size-prefixed), the optional analyzer and the zero terms query.
+     */
+    @Override
+    protected void doWriteTo(StreamOutput out) throws IOException {
+        out.writeGenericValue(value);
+        out.writeVInt(fieldWeights.size());
+        for (Map.Entry<String, Float> fieldsEntry : fieldWeights.entrySet()) {
+            out.writeString(fieldsEntry.getKey());
+            out.writeFloat(fieldsEntry.getValue());
+        }
+        out.writeOptionalString(analyzer);
+        zeroTermsQuery.writeTo(out);
+    }
+
+    @Override
+    public String getWriteableName() {
+        return NAME;
+    }
+
+    /**
+     * Renders the query as a {@code combined_field} object. Every field is written as {@code name^weight};
+     * the analyzer is only written when one was set explicitly.
+     */
+    @Override
+    protected void doXContent(XContentBuilder builder, Params params) throws IOException {
+        builder.startObject(NAME);
+        builder.field(QUERY_FIELD.getPreferredName(), value);
+        builder.startArray(FIELDS_FIELD.getPreferredName());
+        for (Map.Entry<String, Float> fieldEntry : fieldWeights.entrySet()) {
+            builder.value(fieldEntry.getKey() + "^" + fieldEntry.getValue());
+        }
+        builder.endArray();
+        if (analyzer != null) {
+            builder.field(ANALYZER_FIELD.getPreferredName(), analyzer);
+        }
+        builder.field(ZERO_TERMS_QUERY_FIELD.getPreferredName(), zeroTermsQuery.toString());
+        printBoostAndQueryName(builder);
+        builder.endObject();
+    }
+
+    /**
+     * Parses a {@code combined_field} query body, consuming tokens up to the closing {@code END_OBJECT}.
+     *
+     * @param parser the parser to read from
+     * @return the parsed builder
+     * @throws ParsingException if no query value is given, if an unknown field or an unsupported value is
+     *         encountered, or if {@code zero_terms_query} is neither {@code none} nor {@code all}
+     * @throws IllegalArgumentException if a parsed field name or weight is rejected by {@link #field(String, float)}
+     */
+    public static CombinedFieldQueryBuilder fromXContent(XContentParser parser) throws IOException {
+        Object value = null;
+        Map<String, Float> fieldWeights = new HashMap<>();
+        String analyzer = null;
+        MatchQuery.ZeroTermsQuery zeroTermsQuery = MatchQuery.DEFAULT_ZERO_TERMS_QUERY;
+
+        float boost = AbstractQueryBuilder.DEFAULT_BOOST;
+        String queryName = null;
+
+        XContentParser.Token token;
+        String currentFieldName = null;
+        while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
+            if (token == XContentParser.Token.FIELD_NAME) {
+                currentFieldName = parser.currentName();
+            } else if (FIELDS_FIELD.match(currentFieldName, parser.getDeprecationHandler())) {
+                // "fields" accepts either an array of "name[^weight]" strings or a single such string
+                if (token == XContentParser.Token.START_ARRAY) {
+                    while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
+                        parseFieldAndBoost(parser, fieldWeights);
+                    }
+                } else if (token.isValue()) {
+                    parseFieldAndBoost(parser, fieldWeights);
+                } else {
+                    throw new ParsingException(
+                        parser.getTokenLocation(),
+                        "[" + NAME + "] query does not support this value for [" + currentFieldName + "]"
+                    );
+                }
+            } else if (token.isValue()) {
+                if (QUERY_FIELD.match(currentFieldName, parser.getDeprecationHandler())) {
+                    value = parser.objectText();
+                } else if (ANALYZER_FIELD.match(currentFieldName, parser.getDeprecationHandler())) {
+                    analyzer = parser.text();
+                } else if (ZERO_TERMS_QUERY_FIELD.match(currentFieldName, parser.getDeprecationHandler())) {
+                    String zeroTermsValue = parser.text();
+                    if ("none".equalsIgnoreCase(zeroTermsValue)) {
+                        zeroTermsQuery = MatchQuery.ZeroTermsQuery.NONE;
+                    } else if ("all".equalsIgnoreCase(zeroTermsValue)) {
+                        zeroTermsQuery = MatchQuery.ZeroTermsQuery.ALL;
+                    } else {
+                        throw new ParsingException(
+                            parser.getTokenLocation(),
+                            "Unsupported zero_terms_query value [" + zeroTermsValue + "]"
+                        );
+                    }
+                } else if (AbstractQueryBuilder.NAME_FIELD.match(currentFieldName, parser.getDeprecationHandler())) {
+                    queryName = parser.text();
+                } else if (AbstractQueryBuilder.BOOST_FIELD.match(currentFieldName, parser.getDeprecationHandler())) {
+                    boost = parser.floatValue();
+                } else {
+                    throw new ParsingException(
+                        parser.getTokenLocation(),
+                        "[" + NAME + "] query does not support [" + currentFieldName + "]"
+                    );
+                }
+            } else if (token == XContentParser.Token.START_OBJECT) {
+                throw new ParsingException(
+                    parser.getTokenLocation(),
+                    "[" + NAME + "] query does not support this value for [" + currentFieldName + "]"
+                );
+            }
+        }
+
+        if (value == null) {
+            throw new ParsingException(parser.getTokenLocation(), "No text specified for " + NAME + " query");
+        }
+
+        return new CombinedFieldQueryBuilder(value).fields(fieldWeights)
+            .analyzer(analyzer)
+            .zeroTermsQuery(zeroTermsQuery)
+            .queryName(queryName)
+            .boost(boost);
+    }
+
+    /**
+     * Returns the query value this builder was created with.
+     */
+    public Object value() {
+        return value;
+    }
+
+    /**
+     * Adds a field to match against, using the default weight.
+     *
+     * @throws IllegalArgumentException if {@code field} is null or empty
+     */
+    public CombinedFieldQueryBuilder field(String field) {
+        return field(field, AbstractQueryBuilder.DEFAULT_BOOST);
+    }
+
+    /**
+     * Adds a field to match against with a specific weight. Adding the same field again replaces its weight.
+     *
+     * @throws IllegalArgumentException if {@code field} is null or empty, or if {@code weight} is NaN or less than 1
+     */
+    public CombinedFieldQueryBuilder field(String field, float weight) {
+        if (Strings.isEmpty(field)) {
+            throw new IllegalArgumentException("supplied field is null or empty.");
+        }
+        checkValidWeight(weight);
+        this.fieldWeights.put(field, weight);
+        return this;
+    }
+
+    /**
+     * Rejects weights that are below 1 or that are not a number.
+     */
+    private static void checkValidWeight(float weight) {
+        // Float.compare orders NaN above every other value, so NaN has to be rejected explicitly
+        if (Float.isNaN(weight) || weight < 1f) {
+            throw new IllegalArgumentException(("weights must be greater or equal to 1 in [" + NAME + "]"));
+        }
+    }
+
+    /**
+     * Adds several fields to run the query against, each with its own weight. Every entry is validated the same
+     * way as in {@link #field(String, float)}.
+     */
+    public CombinedFieldQueryBuilder fields(Map<String, Float> fields) {
+        for (Map.Entry<String, Float> fieldWeight : fields.entrySet()) {
+            field(fieldWeight.getKey(), fieldWeight.getValue());
+        }
+        return this;
+    }
+
+    /**
+     * Returns the map of field names to weights held by this builder (the internal map, not a copy).
+     */
+    public Map<String, Float> fields() {
+        return fieldWeights;
+    }
+
+    /**
+     * Explicitly sets the name of the analyzer to use for all fields. When left {@code null}, the search analyzer
+     * of each field is used instead.
+     */
+    public CombinedFieldQueryBuilder analyzer(String analyzer) {
+        this.analyzer = analyzer;
+        return this;
+    }
+
+    /**
+     * Returns the name of the explicitly configured analyzer, or {@code null} if none was set.
+     */
+    public String analyzer() {
+        return analyzer;
+    }
+
+    /**
+     * Sets what the query should match when analysis of the query value produces no terms.
+     *
+     * @throws IllegalArgumentException if {@code zeroTermsQuery} is {@code null}
+     */
+    public CombinedFieldQueryBuilder zeroTermsQuery(MatchQuery.ZeroTermsQuery zeroTermsQuery) {
+        if (zeroTermsQuery == null) {
+            throw new IllegalArgumentException("[" + NAME + "] requires zero terms query to be non-null");
+        }
+        this.zeroTermsQuery = zeroTermsQuery;
+        return this;
+    }
+
+    /**
+     * Returns the configured zero terms behaviour.
+     */
+    public MatchQuery.ZeroTermsQuery zeroTermsQuery() {
+        return zeroTermsQuery;
+    }
+
+    /**
+     * Returns whether the field can take part in this query: it must be mapped, and its text search info must be
+     * neither {@link TextSearchInfo#NONE} nor {@link TextSearchInfo#SIMPLE_MATCH_ONLY}.
+     */
+    static boolean isTextField(MappedFieldType fieldType) {
+        if (fieldType == null) {
+            return false;
+        }
+        return fieldType.getTextSearchInfo() != TextSearchInfo.NONE && fieldType.getTextSearchInfo() != TextSearchInfo.SIMPLE_MATCH_ONLY;
+    }
+
+    /**
+     * Builds the Lucene query.
+     * <ul>
+     * <li>If none of the configured fields is a usable text field, an "unmapped fields" query is returned.</li>
+     * <li>If analysis yields no terms for any field, the result is decided by {@link #zeroTermsQuery()}.</li>
+     * <li>Otherwise a {@link CombinedFieldQuery} over the usable fields and all collected terms is returned.</li>
+     * </ul>
+     *
+     * @throws IllegalArgumentException if an explicit analyzer name was set but no such analyzer exists
+     */
+    @Override
+    protected Query doToQuery(QueryShardContext context) throws IOException {
+        boolean hasTextField = fieldWeights.keySet().stream().anyMatch(k -> isTextField(context.fieldMapper(k)));
+        if (hasTextField == false) {
+            return Queries.newUnmappedFieldsQuery(fieldWeights.keySet());
+        }
+        Analyzer explicitAnalyzer = null;
+        if (analyzer != null) {
+            explicitAnalyzer = context.getMapperService().getIndexAnalyzers().get(analyzer);
+            if (explicitAnalyzer == null) {
+                throw new IllegalArgumentException("No analyzer found for [" + analyzer + "]");
+            }
+        }
+
+        CombinedFieldQuery.Builder builder = new CombinedFieldQuery.Builder();
+        boolean hasTerms = false;
+        for (Map.Entry<String, Float> fieldWeight : fieldWeights.entrySet()) {
+            String fieldName = fieldWeight.getKey();
+            MappedFieldType fieldType = context.fieldMapper(fieldName);
+            if (isTextField(fieldType) == false) {
+                // ignore unmapped fields or fields that do not support text search
+                continue;
+            }
+            builder.addField(fieldName, fieldWeight.getValue());
+
+            Analyzer fieldAnalyzer;
+            if (explicitAnalyzer == null) {
+                // Use per-field analyzer
+                fieldAnalyzer = context.getSearchAnalyzer(fieldType);
+            } else {
+                fieldAnalyzer = explicitAnalyzer;
+            }
+            // collectAllTerms must run for every field, so it is evaluated before the short-circuiting "||"
+            hasTerms = collectAllTerms(fieldName, fieldAnalyzer, value.toString(), builder) || hasTerms;
+        }
+        if (hasTerms == false) {
+            switch (zeroTermsQuery) {
+                case NONE:
+                    return Queries.newMatchNoDocsQuery("Matching no documents because no terms present");
+                case ALL:
+                    return Queries.newMatchAllQuery();
+                default:
+                    throw new IllegalStateException("unknown zeroTermsQuery " + zeroTermsQuery);
+            }
+        }
+        return builder.build();
+    }
+
+    /**
+     * Analyzes {@code queryString} with the given analyzer and adds every produced term to {@code builder}.
+     *
+     * @return {@code true} if at least one term was added
+     */
+    private static boolean collectAllTerms(String fieldName, Analyzer analyzer, String queryString, CombinedFieldQuery.Builder builder)
+        throws IOException {
+        boolean hasTerms = false;
+        // try-with-resources closes the stream even when analysis throws part-way through
+        try (TokenStream tokenStream = analyzer.tokenStream(fieldName, queryString)) {
+            TermToBytesRefAttribute termAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class);
+            tokenStream.reset();
+            while (tokenStream.incrementToken()) {
+                // the attribute's BytesRef is reused between tokens, so take a copy before handing it on
+                builder.addTerm(BytesRef.deepCopyOf(termAtt.getBytesRef()));
+                hasTerms = true;
+            }
+            // the TokenStream consumer workflow is reset() -> incrementToken()* -> end() -> close()
+            tokenStream.end();
+        }
+        return hasTerms;
+    }
+
+    /**
+     * Compares all state that influences the generated query, including the zero terms behaviour.
+     */
+    @Override
+    protected boolean doEquals(CombinedFieldQueryBuilder other) {
+        return Objects.equals(value, other.value)
+            && Objects.equals(fieldWeights, other.fieldWeights)
+            && Objects.equals(analyzer, other.analyzer)
+            && Objects.equals(zeroTermsQuery, other.zeroTermsQuery);
+    }
+
+    /**
+     * Hashes the same fields that {@link #doEquals(CombinedFieldQueryBuilder)} compares.
+     */
+    @Override
+    protected int doHashCode() {
+        return Objects.hash(value, fieldWeights, analyzer, zeroTermsQuery);
+    }
+}
diff --git a/server/src/main/java/org/opensearch/index/query/MultiMatchQueryBuilder.java b/server/src/main/java/org/opensearch/index/query/MultiMatchQueryBuilder.java
@@ -779,7 +779,7 @@ public static MultiMatchQueryBuilder fromXContent(XContentParser parser) throws
         return builder;
     }
 
-    private static void parseFieldAndBoost(XContentParser parser, Map<String, Float> fieldsBoosts) throws IOException {
+    static void parseFieldAndBoost(XContentParser parser, Map<String, Float> fieldsBoosts) throws IOException {
         String fField = null;
         Float fBoost = AbstractQueryBuilder.DEFAULT_BOOST;
         char[] fieldText = parser.textCharacters();

diff --git a/server/src/main/java/org/opensearch/search/SearchModule.java b/server/src/main/java/org/opensearch/search/SearchModule.java
@@ -48,6 +48,7 @@
 import org.opensearch.common.xcontent.XContentParser;
 import org.opensearch.index.query.BoolQueryBuilder;
 import org.opensearch.index.query.BoostingQueryBuilder;
+import org.opensearch.index.query.CombinedFieldQueryBuilder;
 import org.opensearch.index.query.CommonTermsQueryBuilder;
 import org.opensearch.index.query.ConstantScoreQueryBuilder;
 import org.opensearch.index.query.DisMaxQueryBuilder;
@@ -1195,6 +1196,9 @@ private void registerQueryParsers(List<SearchPlugin> plugins) {
         if (ShapesAvailability.JTS_AVAILABLE && ShapesAvailability.SPATIAL4J_AVAILABLE) {
             registerQuery(new QuerySpec<>(GeoShapeQueryBuilder.NAME, GeoShapeQueryBuilder::new, GeoShapeQueryBuilder::fromXContent));
         }
+        registerQuery(
+            new QuerySpec<>(CombinedFieldQueryBuilder.NAME, CombinedFieldQueryBuilder::new, CombinedFieldQueryBuilder::fromXContent)
+        );
 
         registerFromPlugin(plugins, SearchPlugin::getQueries, this::registerQuery);
     }