Skip to content

Commit

Permalink
Add a limit for graph phrase query expansion (#34031)
Browse files Browse the repository at this point in the history
Today query parsers throw TooManyClauses exception when a query creates
too many clauses. However graph phrase queries do not respect this limit.
This change adds a protection against crazy expansions that can happen when
building a graph phrase query. This is a temporary copy of the fix available
in https://issues.apache.org/jira/browse/LUCENE-8479 but not merged yet.
This logic will be removed when we integrate the Lucene patch in a future
release.
  • Loading branch information
jimczi committed Sep 25, 2018
1 parent 0e73001 commit 01123d0
Show file tree
Hide file tree
Showing 2 changed files with 159 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.QueryBuilder;
import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
Expand All @@ -60,11 +61,14 @@
import org.elasticsearch.index.query.support.QueryParsers;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import static org.elasticsearch.common.lucene.search.Queries.newLenientFieldQuery;
import static org.elasticsearch.common.lucene.search.Queries.newUnmappedFieldQuery;

public class MatchQuery {
public class MatchQuery {

private static final DeprecationLogger DEPRECATION_LOGGER = new DeprecationLogger(Loggers.getLogger(MappedFieldType.class));

Expand Down Expand Up @@ -525,6 +529,82 @@ private Query boolToExtendedCommonTermsQuery(BooleanQuery bq, Occur highFreqOccu
}
return query;
}

/**
* Overrides {@link QueryBuilder#analyzeGraphPhrase(TokenStream, String, int)} to add
* a limit (see {@link BooleanQuery#getMaxClauseCount()}) to the number of {@link SpanQuery}
* that this method can create.
*
* TODO Remove when https://issues.apache.org/jira/browse/LUCENE-8479 is fixed.
*/
@Override
protected SpanQuery analyzeGraphPhrase(TokenStream source, String field, int phraseSlop) throws IOException {
source.reset();
GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source);
List<SpanQuery> clauses = new ArrayList<>();
int[] articulationPoints = graph.articulationPoints();
int lastState = 0;
int maxBooleanClause = BooleanQuery.getMaxClauseCount();
for (int i = 0; i <= articulationPoints.length; i++) {
int start = lastState;
int end = -1;
if (i < articulationPoints.length) {
end = articulationPoints[i];
}
lastState = end;
final SpanQuery queryPos;
if (graph.hasSidePath(start)) {
List<SpanQuery> queries = new ArrayList<>();
Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
while (it.hasNext()) {
TokenStream ts = it.next();
SpanQuery q = createSpanQuery(ts, field);
if (q != null) {
if (queries.size() >= maxBooleanClause) {
throw new BooleanQuery.TooManyClauses();
}
queries.add(q);
}
}
if (queries.size() > 0) {
queryPos = new SpanOrQuery(queries.toArray(new SpanQuery[0]));
} else {
queryPos = null;
}
} else {
Term[] terms = graph.getTerms(field, start);
assert terms.length > 0;
if (terms.length >= maxBooleanClause) {
throw new BooleanQuery.TooManyClauses();
}
if (terms.length == 1) {
queryPos = new SpanTermQuery(terms[0]);
} else {
SpanTermQuery[] orClauses = new SpanTermQuery[terms.length];
for (int idx = 0; idx < terms.length; idx++) {
orClauses[idx] = new SpanTermQuery(terms[idx]);
}

queryPos = new SpanOrQuery(orClauses);
}
}

if (queryPos != null) {
if (clauses.size() >= maxBooleanClause) {
throw new BooleanQuery.TooManyClauses();
}
clauses.add(queryPos);
}
}

if (clauses.isEmpty()) {
return null;
} else if (clauses.size() == 1) {
return clauses.get(0);
} else {
return new SpanNearQuery(clauses.toArray(new SpanQuery[0]), phraseSlop, true);
}
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@

package org.elasticsearch.index.query;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CannedBinaryTokenStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.ExtendedCommonTermsQuery;
import org.apache.lucene.search.BooleanClause;
Expand All @@ -31,6 +36,7 @@
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.elasticsearch.Version;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequest;
import org.elasticsearch.common.ParsingException;
import org.elasticsearch.common.Strings;
Expand All @@ -40,12 +46,15 @@
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.search.MatchQuery;
import org.elasticsearch.index.search.MatchQuery.Type;
import org.elasticsearch.index.search.MatchQuery.ZeroTermsQuery;
import org.elasticsearch.search.internal.SearchContext;
import org.elasticsearch.test.AbstractQueryTestCase;
import org.hamcrest.Matcher;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
Expand Down Expand Up @@ -403,4 +412,73 @@ public void testLenientPhraseQuery() throws Exception {
assertThat(query.toString(),
containsString("field:[string_no_pos] was indexed without position data; cannot run PhraseQuery"));
}

public void testMaxBooleanClause() {
assumeTrue("test runs only when at least a type is registered", getCurrentTypes().length > 0);
MatchQuery query = new MatchQuery(createShardContext());
query.setAnalyzer(new MockGraphAnalyzer(createGiantGraph(40)));
expectThrows(BooleanQuery.TooManyClauses.class, () -> query.parse(Type.PHRASE, STRING_FIELD_NAME, ""));
query.setAnalyzer(new MockGraphAnalyzer(createGiantGraphMultiTerms()));
expectThrows(BooleanQuery.TooManyClauses.class, () -> query.parse(Type.PHRASE, STRING_FIELD_NAME, ""));
}

private static class MockGraphAnalyzer extends Analyzer {
final CannedBinaryTokenStream.BinaryToken[] tokens;

private MockGraphAnalyzer(CannedBinaryTokenStream.BinaryToken[] tokens ) {
this.tokens = tokens;
}
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
return new TokenStreamComponents(tokenizer) {
@Override
public TokenStream getTokenStream() {
return new CannedBinaryTokenStream(tokens);
}

@Override
protected void setReader(final Reader reader) {
}
};
}
}

/**
* Creates a graph token stream with 2 side paths at each position.
**/
private static CannedBinaryTokenStream.BinaryToken[] createGiantGraph(int numPos) {
List<CannedBinaryTokenStream.BinaryToken> tokens = new ArrayList<>();
BytesRef term1 = new BytesRef("foo");
BytesRef term2 = new BytesRef("bar");
for (int i = 0; i < numPos;) {
if (i % 2 == 0) {
tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
tokens.add(new CannedBinaryTokenStream.BinaryToken(term1, 0, 2));
i += 2;
} else {
tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
i++;
}
}
return tokens.toArray(new CannedBinaryTokenStream.BinaryToken[0]);
}

/**
* Creates a graph token stream with {@link BooleanQuery#getMaxClauseCount()}
* expansions at the last position.
**/
private static CannedBinaryTokenStream.BinaryToken[] createGiantGraphMultiTerms() {
List<CannedBinaryTokenStream.BinaryToken> tokens = new ArrayList<>();
BytesRef term1 = new BytesRef("foo");
BytesRef term2 = new BytesRef("bar");
tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
tokens.add(new CannedBinaryTokenStream.BinaryToken(term1, 0, 2));
tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
for (int i = 0; i < BooleanQuery.getMaxClauseCount(); i++) {
tokens.add(new CannedBinaryTokenStream.BinaryToken(term1, 0, 1));
}
return tokens.toArray(new CannedBinaryTokenStream.BinaryToken[0]);
}
}

0 comments on commit 01123d0

Please sign in to comment.