Skip to content

Commit

Permalink
RavenDB-22937 Only new Corax indexes will be adjusted to the new behavior for wildcard queries
Browse files Browse the repository at this point in the history
  • Loading branch information
maciejaszyk authored and ppekrol committed Oct 11, 2024
1 parent 5311aed commit 8abd28e
Show file tree
Hide file tree
Showing 6 changed files with 291 additions and 37 deletions.
204 changes: 200 additions & 4 deletions src/Corax/Querying/IndexSearcher.Search.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,24 @@ namespace Corax.Querying;

public partial class IndexSearcher
{
/// <summary>
/// Selects which search implementation <c>SearchQuery</c> dispatches to.
/// </summary>
public enum SearchQueryOptions
{
/// <summary>Dispatches to <c>SearchQueryLegacy</c>.</summary>
Legacy,
/// <summary>Dispatches to <c>SearchQueryWithPhraseQuery</c>.</summary>
PhraseQuery,
/// <summary>Dispatches to <c>SearchQueryWithPhraseQueryWithWildcardQueriesAdjustments</c>.</summary>
PhraseQueryWithWildcardAdjustments
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public IQueryMatch SearchQuery(in FieldMetadata field, IEnumerable<string> values, Constants.Search.Operator @operator, bool? supportsPhraseQuery = null, in CancellationToken cancellationToken = default)
public IQueryMatch SearchQuery(in FieldMetadata field, IEnumerable<string> values, Constants.Search.Operator @operator, SearchQueryOptions searchQueryOptions = SearchQueryOptions.PhraseQueryWithWildcardAdjustments, in CancellationToken cancellationToken = default)
{
return supportsPhraseQuery is true or null
? SearchQueryWithPhraseQuery(field, values, @operator, cancellationToken)
: SearchQueryLegacy(field, values, @operator, cancellationToken);
return searchQueryOptions switch
{
SearchQueryOptions.Legacy => SearchQueryLegacy(field, values, @operator, cancellationToken),
SearchQueryOptions.PhraseQueryWithWildcardAdjustments =>
SearchQueryWithPhraseQueryWithWildcardQueriesAdjustments(field, values, @operator, cancellationToken),
SearchQueryOptions.PhraseQuery => SearchQueryWithPhraseQuery(field, values, @operator, cancellationToken),
_ => throw new ArgumentOutOfRangeException(nameof(searchQueryOptions))
};
}

private IQueryMatch SearchQueryLegacy(FieldMetadata field, IEnumerable<string> values, Constants.Search.Operator @operator, in CancellationToken cancellationToken)
Expand Down Expand Up @@ -172,7 +184,191 @@ IEnumerable<Token> GetTokens(string source)

}


/// <summary>
/// Builds the query for a search over <paramref name="field"/> from the given <paramref name="values"/>.
/// Each value is split on spaces: a value with no tokens is skipped, a value with exactly one token is
/// handled as a term / wildcard / exists match, and a value with several tokens becomes a phrase query.
/// The partial queries are combined with <paramref name="operator"/>.
/// </summary>
/// <param name="field">Field to search; its analyzer is replaced by the search analyzer before use.</param>
/// <param name="values">Raw search values, one entry per term or phrase.</param>
/// <param name="operator">How the partial queries are combined (And / Or).</param>
/// <param name="cancellationToken">Token forwarded to the created queries.</param>
/// <returns>The combined query, or an empty term match when no value produced a query.</returns>
/// <exception cref="InvalidOperationException">The field has no analyzer and is not dynamic.</exception>
private IQueryMatch SearchQueryWithPhraseQuery(FieldMetadata field, IEnumerable<string> values, Constants.Search.Operator @operator, in CancellationToken cancellationToken = default)
{
AssertFieldIsSearched();
// Dynamic fields take their search analyzer from the field mapping; static fields use their own analyzer.
var searchAnalyzer = field.IsDynamic
? _fieldMapping.SearchAnalyzer(field.FieldName.ToString())
: field.Analyzer;

field = field.ChangeAnalyzer(field.Mode, searchAnalyzer);

// Passed by ref to CreateWildcardAnalyzer for wildcard terms; stays null when no wildcard term is seen.
Analyzer wildcardAnalyzer = null;
IQueryMatch searchQuery = null;

// Plain single terms are collected here and turned into one In/AllIn query after the loop.
List<Slice> termMatches = null;
var terms = new ContextBoundNativeList<Slice>(Allocator);
foreach (var word in values)
{
var tokensInWord = CountTokens(word, out var token);

if (tokensInWord == 0)
continue;

//Single word
if (tokensInWord == 1)
{
var value = word.AsSpan(token.Offset, (int)token.Length);
var termType = GetTermType(value);
// startIncrement / lengthIncrement trim one character from the front and/or back of the token
// before analysis (presumably the '*' wildcard marker - confirm against GetTermType).
(int startIncrement, int lengthIncrement, Analyzer analyzer) = termType switch
{
Constants.Search.SearchMatchOptions.StartsWith => (0, -1, CreateWildcardAnalyzer(field, ref wildcardAnalyzer)),
Constants.Search.SearchMatchOptions.EndsWith => (1, 0, CreateWildcardAnalyzer(field, ref wildcardAnalyzer)),
Constants.Search.SearchMatchOptions.Contains => (1, -1, CreateWildcardAnalyzer(field, ref wildcardAnalyzer)),
Constants.Search.SearchMatchOptions.TermMatch => (0, 0, searchAnalyzer),
Constants.Search.SearchMatchOptions.Exists => (0, 0, searchAnalyzer),
_ => throw new InvalidExpressionException("Unknown flag inside Search match.")
};

var termReadyToAnalyze = value.Slice(startIncrement, value.Length - startIncrement + lengthIncrement);

if (termType is Constants.Search.SearchMatchOptions.TermMatch)
{
termMatches ??= new();
terms.Clear(); // Clear the terms list.
// Note: the whole `word` is analyzed here, not the trimmed `value` span.
EncodeAndApplyAnalyzerForMultipleTerms(field, word, ref terms);

//When a single word is analyzed into multiple terms we have to jump into the phrase query path
if (terms.Count > 1)
goto PhraseQuery;

foreach (var term in terms.GetEnumerator())
{
if (term.Size == 0)
continue; //skip empty results

termMatches.Add(term);
}
continue;
}

Slice analyzedTerm = default;

// Exists needs no term, so nothing is analyzed for it.
if (termType is not Constants.Search.SearchMatchOptions.Exists)
{
analyzedTerm = EncodeAndApplyAnalyzer(field, analyzer, termReadyToAnalyze);
if (analyzedTerm.Size == 0)
continue; //skip empty results
}

var query = termType switch
{
Constants.Search.SearchMatchOptions.TermMatch => throw new InvalidDataException(
$"{nameof(TermMatch)} is handled in different part of evaluator. This is a bug."),
Constants.Search.SearchMatchOptions.Exists => ExistsQuery(field, token: cancellationToken),
Constants.Search.SearchMatchOptions.StartsWith => StartWithQuery(field, analyzedTerm, token: cancellationToken),
Constants.Search.SearchMatchOptions.EndsWith => EndsWithQuery(field, analyzedTerm, token: cancellationToken),
Constants.Search.SearchMatchOptions.Contains => ContainsQuery(field, analyzedTerm, token: cancellationToken),
_ => throw new ArgumentOutOfRangeException(nameof(termType), termType.ToString())
};

// The first partial query becomes the root; later ones are combined with it below.
if (searchQuery is null)
{
searchQuery = query;
continue;
}

searchQuery = @operator switch
{
Constants.Search.Operator.Or => Or<IQueryMatch, MultiTermMatch>(searchQuery, query, token: cancellationToken),
Constants.Search.Operator.And => And<IQueryMatch, MultiTermMatch>(searchQuery, query, token: cancellationToken),
_ => throw new ArgumentOutOfRangeException(nameof(@operator), @operator, null)
};

continue;
}

//Phrase query
terms.Clear();
EncodeAndApplyAnalyzerForMultipleTerms(field, word, ref terms);

if (terms.Count == 0)
continue; //sentence contained only stop-words
// Also reached via goto from the single-word branch, with `terms` already filled for the current word.
PhraseQuery:
var hs = new HashSet<Slice>(SliceComparer.Instance);
for (var i = 0; i < terms.Count; ++i)
{
hs.Add(terms[i]);
}

// The AllIn query over the distinct terms is handed to PhraseQuery together with the ordered term list.
var allIn = AllInQuery(field, hs, cancellationToken: cancellationToken);

var phraseMatch = PhraseQuery(allIn, field, terms.ToSpan());

searchQuery = (searchQuery, @operator) switch
{
(null, _) => phraseMatch,
(_, Constants.Search.Operator.And) => And(searchQuery, phraseMatch, cancellationToken),
(_, Constants.Search.Operator.Or) => Or(searchQuery, phraseMatch, cancellationToken),
_ => throw new ArgumentOutOfRangeException($"({searchQuery?.GetType().FullName ?? "NULL"}, {@operator.ToString()})")
};
}

// Fold the collected plain terms into a single query and attach it to whatever the loop produced.
if (termMatches?.Count > 0)
{
var termMatchesQuery = @operator switch
{
Constants.Search.Operator.And => AllInQuery(field, termMatches.ToHashSet(SliceComparer.Instance), token: cancellationToken),
Constants.Search.Operator.Or => InQuery(field, termMatches, token: cancellationToken),
_ => throw new ArgumentOutOfRangeException(nameof(@operator), @operator, null)
};

if (searchQuery is null)
searchQuery = termMatchesQuery;
else
{
// NOTE(review): unlike the other Or/And calls in this method, these two do not pass
// cancellationToken - confirm whether that is intentional.
searchQuery = @operator switch
{
Constants.Search.Operator.Or => Or(termMatchesQuery, searchQuery),
Constants.Search.Operator.And => And(termMatchesQuery, searchQuery),
_ => throw new ArgumentOutOfRangeException(nameof(@operator), @operator, null)
};
}
}


// Throws when a non-dynamic field has no analyzer.
void AssertFieldIsSearched()
{
if (field.Analyzer == null && field.IsDynamic == false)
throw new InvalidOperationException($"{nameof(SearchQueryWithPhraseQuery)} requires analyzer.");
}

// NOTE(review): this dispose runs only on the normal path; an exception thrown above skips it - confirm
// whether that matters for the analyzer created by CreateWildcardAnalyzer.
wildcardAnalyzer?.Dispose();

return searchQuery ?? TermMatch.CreateEmpty(this, Allocator);

// Counts the space-separated tokens in `source`; `termToken` receives the last token found
// (the only one when the count is 1). In a phrase query we expect multiple tokens in most cases.
int CountTokens(in string source, out Token termToken)
{
int count = 0;
termToken = default;

if (string.IsNullOrEmpty(source))
return count;

var i = 0;
while (i < source.Length)
{
// Skip the run of spaces before the next token.
while (i < source.Length && source[i] == ' ')
i++;

int start = i;
while (i < source.Length && source[i] != ' ')
i++;

// start == i only when trailing spaces ended the input without another token.
if (start != i)
{
termToken = new Token() {Length = (uint)(i - start), Offset = start, Type = TokenType.Word};
count++;
}
}

return count;
}
}

private IQueryMatch SearchQueryWithPhraseQueryWithWildcardQueriesAdjustments(FieldMetadata field, IEnumerable<string> values, Constants.Search.Operator @operator, in CancellationToken cancellationToken = default)
{
AssertFieldIsSearched();
IQueryMatch searchQuery = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -177,11 +177,12 @@ public static class IndexVersion
public const long StoreOnlySupportInCoraxIndexes = 60_003; // RavenDB-22369
public const long JavaScriptProperlyHandleDynamicFieldsIndexFields = 60_004; //RavenDB-22363
public const long UseNonExistingPostingList = 60_005; // RavenDB-22703
public const long CoraxSearchWildcardAdjustment = 60_006; // RavenDB-22937

/// <summary>
/// Remember to bump this
/// </summary>
public const long CurrentVersion = UseNonExistingPostingList;
public const long CurrentVersion = CoraxSearchWildcardAdjustment;

public static bool IsTimeTicksInJavaScriptIndexesSupported(long indexVersion)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1027,7 +1027,12 @@ private static IQueryMatch HandleSearch(Parameters builderParameters, MethodExpr
var fieldsToFetch = builderParameters.FieldsToFetch;
var indexFieldsMapping = builderParameters.IndexFieldsMapping;
var allocator = builderParameters.Allocator;
var supportsPhraseQuery = builderParameters.Index.Definition.Version >= IndexDefinitionBaseServerSide.IndexVersion.PhraseQuerySupportInCoraxIndexes;
IndexSearcher.SearchQueryOptions searchQueryOptions = IndexSearcher.SearchQueryOptions.Legacy;
if (builderParameters.Index.Definition.Version >= IndexDefinitionBaseServerSide.IndexVersion.CoraxSearchWildcardAdjustment)
searchQueryOptions = IndexSearcher.SearchQueryOptions.PhraseQueryWithWildcardAdjustments;
else if (builderParameters.Index.Definition.Version >= IndexDefinitionBaseServerSide.IndexVersion.PhraseQuerySupportInCoraxIndexes)
searchQueryOptions = IndexSearcher.SearchQueryOptions.PhraseQuery;


QueryFieldName fieldName;
var isDocumentId = false;
Expand Down Expand Up @@ -1081,34 +1086,8 @@ private static IQueryMatch HandleSearch(Parameters builderParameters, MethodExpr
builderParameters.DynamicFields, handleSearch: true, hasBoost: builderParameters.HasBoost);

// Wildcard queries:
if (supportsPhraseQuery && valueAsString.Length >= 1 && (valueAsString[0] == '*' || (valueAsString.Length >= 2 && valueAsString[^1] == '*')))
{
// We need to retrieve the analyzer for the dynamic field since the field metadata is created dynamically.
if (fieldMetadata.IsDynamic)
fieldMetadata = fieldMetadata.ChangeAnalyzer(fieldMetadata.Mode, builderParameters.IndexFieldsMapping.SearchAnalyzer(fieldMetadata.FieldName.ToString()));


if (fieldMetadata.Analyzer is LuceneAnalyzerAdapter laa)
{
//logic from LuceneQueryBuilder
var luceneAnalyzer = laa.Analyzer switch
{
KeywordAnalyzer keywordAnalyzer => builderParameters.IndexFieldsMapping.ExactAnalyzer(fieldMetadata.FieldName.ToString()),
// here we force a lower case keyword analyzer to ensure proper behavior
// https://ayende.com/blog/191841-B/understanding-query-processing-and-wildcards-in-ravendb
RavenStandardAnalyzer or NGramAnalyzer => builderParameters.IndexFieldsMapping.DefaultAnalyzer,
LowerCaseKeywordAnalyzer or CollationAnalyzer => builderParameters.IndexFieldsMapping.DefaultAnalyzer,
// if the user has a custom analyzer, we'll use that, and they can deal with any surprises
// in wildcard queries
_ => null
};

if (luceneAnalyzer != null)
fieldMetadata = fieldMetadata.ChangeAnalyzer(FieldIndexingMode.Search, luceneAnalyzer);
}

// Currently, we do not have any custom Corax analyzers, so we don't need to address them.
}
if (searchQueryOptions is IndexSearcher.SearchQueryOptions.PhraseQueryWithWildcardAdjustments && valueAsString.Length >= 1 && (valueAsString[0] == '*' || (valueAsString.Length >= 2 && valueAsString[^1] == '*')))
fieldMetadata = ReplaceAnalyzerForWildcardQueries(fieldMetadata);


if (proximity.HasValue)
Expand All @@ -1132,7 +1111,7 @@ private static IQueryMatch HandleSearch(Parameters builderParameters, MethodExpr
QueryBuilderHelper.ThrowInvalidOperatorInSearch(metadata, queryParameters, fieldExpression);
}

return indexSearcher.SearchQuery(fieldMetadata, GetValues(), @operator, supportsPhraseQuery, builderParameters.Token);
return indexSearcher.SearchQuery(fieldMetadata, GetValues(), @operator, searchQueryOptions, builderParameters.Token);

/*
* Here we need to deal with value that comes from the user, which means that we
Expand Down Expand Up @@ -1229,6 +1208,37 @@ bool IsEscaped(string input, int index)

return (count & 1) == 1;
}

// Returns the field metadata to use for a wildcard search term.
//
// Step 1: a dynamic field gets its search analyzer from the index field mapping, because its
//         metadata is created on the fly.
// Step 2: when the analyzer in effect after step 1 is a Lucene adapter wrapping one of the known
//         analyzers, it is swapped for the exact / default analyzer (same logic as LuceneQueryBuilder).
//
// Every step works on `result`, so step 2 inspects the analyzer resolved in step 1 and the helper
// depends only on its parameter, not on the enclosing method's `fieldMetadata` local.
FieldMetadata ReplaceAnalyzerForWildcardQueries(in FieldMetadata original)
{
    FieldMetadata result = original;

    // We need to retrieve the analyzer for the dynamic field since the field metadata is created dynamically.
    if (result.IsDynamic)
        result = result.ChangeAnalyzer(result.Mode, builderParameters.IndexFieldsMapping.SearchAnalyzer(result.FieldName.ToString()));

    // Currently, we do not have any custom Corax analyzers, so we don't need to address them.
    if (result.Analyzer is not LuceneAnalyzerAdapter laa)
        return result;

    //logic from LuceneQueryBuilder
    var luceneAnalyzer = laa.Analyzer switch
    {
        KeywordAnalyzer => builderParameters.IndexFieldsMapping.ExactAnalyzer(result.FieldName.ToString()),
        // here we force a lower case keyword analyzer to ensure proper behavior
        // https://ayende.com/blog/191841-B/understanding-query-processing-and-wildcards-in-ravendb
        RavenStandardAnalyzer or NGramAnalyzer => builderParameters.IndexFieldsMapping.DefaultAnalyzer,
        LowerCaseKeywordAnalyzer or CollationAnalyzer => builderParameters.IndexFieldsMapping.DefaultAnalyzer,
        // if the user has a custom analyzer, we'll use that, and they can deal with any surprises
        // in wildcard queries
        _ => null
    };

    if (luceneAnalyzer != null)
        result = result.ChangeAnalyzer(FieldIndexingMode.Search, luceneAnalyzer);

    return result;
}
}

private static IQueryMatch HandleSpatial(Parameters builderParameters, MethodExpression expression, MethodType spatialMethod)
Expand Down
Binary file not shown.
Loading

0 comments on commit 8abd28e

Please sign in to comment.