Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expand small value sets to all case permutations in SearchValues<string> #98902

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
Expand All @@ -14,6 +15,8 @@ namespace System.Buffers
{
internal static class StringSearchValues
{
private const int TeddyBucketCount = 8;

private static readonly SearchValues<char> s_asciiLetters =
SearchValues.Create("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");

Expand Down Expand Up @@ -248,6 +251,18 @@ static SearchValues<string> PickAhoCorasickImplementation<TCaseSensitivity>(AhoC

Debug.Assert(!(asciiStartLettersOnly && asciiStartUnaffectedByCaseConversion));

// If we still have empty buckets we could use and we're ignoring case, we may be able to
// generate all possible permutations of the first N characters and switch to case-sensitive searching.
// E.g. ["ab", "c!"] => ["ab", "Ab", "aB", "AB", "c!", "C!"].
MihaZupan marked this conversation as resolved.
Show resolved Hide resolved
MihaZupan marked this conversation as resolved.
Show resolved Hide resolved
// This won't apply to inputs with many letters (e.g. "abc" => 8 permutations on its own).
if (!asciiStartUnaffectedByCaseConversion &&
values.Length < TeddyBucketCount &&
TryGenerateAllCasePermutationsForPrefixes(values, n, TeddyBucketCount, out string[]? newValues))
{
asciiStartUnaffectedByCaseConversion = true;
values = newValues;
}

if (asciiStartUnaffectedByCaseConversion)
{
return nonAsciiAffectedByCaseConversion
Expand Down Expand Up @@ -278,9 +293,9 @@ private static SearchValues<string> PickTeddyImplementation<TStartCaseSensitivit
Debug.Assert(values.Length > 1);
Debug.Assert(n is 2 or 3);

if (values.Length > 8)
if (values.Length > TeddyBucketCount)
{
string[][] buckets = TeddyBucketizer.Bucketize(values, bucketCount: 8, n);
string[][] buckets = TeddyBucketizer.Bucketize(values, TeddyBucketCount, n);

// Potential optimization: We don't have to pick the first N characters for the fingerprint.
// Different offset selection can noticeably improve throughput (e.g. 2x).
Expand All @@ -297,6 +312,68 @@ private static SearchValues<string> PickTeddyImplementation<TStartCaseSensitivit
}
}

/// <summary>
/// Attempts to expand <paramref name="values"/> into every casing permutation of the ASCII letters
/// found in the first <paramref name="n"/> characters of each value.
/// E.g. with n = 2: ["ab", "c!"] => ["ab", "Ab", "aB", "AB", "c!", "C!"].
/// </summary>
/// <param name="values">
/// The values to expand. Each value must be at least <paramref name="n"/> characters long, and those
/// leading characters are expected to be ASCII (checked via <see cref="Debug.Assert(bool)"/>).
/// </param>
/// <param name="n">The number of leading characters to permute. Asserted to be 2 or 3.</param>
/// <param name="maxValues">The maximum number of values the expanded set is allowed to contain.</param>
/// <param name="newValues">
/// On success, the expanded set. Permutations of a given input value are stored contiguously, starting
/// with the original value itself. <see langword="null"/> on failure.
/// </param>
/// <returns>
/// <see langword="true"/> if the total number of permutations does not exceed <paramref name="maxValues"/>;
/// otherwise <see langword="false"/>.
/// </returns>
private static bool TryGenerateAllCasePermutationsForPrefixes(ReadOnlySpan<string> values, int n, int maxValues, [NotNullWhen(true)] out string[]? newValues)
{
    Debug.Assert(n is 2 or 3);
    Debug.Assert(values.Length < maxValues);

    // Count how many possible permutations there are.
    int newValuesCount = 0;

    foreach (string value in values)
    {
        int permutations = 1;

        foreach (char c in value.AsSpan(0, n))
        {
            Debug.Assert(char.IsAscii(c));

            // Every letter in the prefix doubles the number of casings for this value.
            if (char.IsAsciiLetter(c))
            {
                permutations *= 2;
            }
        }

        newValuesCount += permutations;
    }

    Debug.Assert(newValuesCount > values.Length, "Shouldn't have been called if there were no letters present");

    if (newValuesCount > maxValues)
    {
        newValues = null;
        return false;
    }

    // Generate the permutations.
    newValues = new string[newValuesCount];
    int index = 0;

    foreach (string value in values)
    {
        // Index of the first permutation that belongs to the current value.
        int start = index;

        newValues[index++] = value;

        for (int i = 0; i < n; i++)
        {
            char c = value[i];

            if (char.IsAsciiLetter(c))
            {
                // Flipping bit 0x20 switches an ASCII letter between upper and lower case.
                char flipped = (char)(c ^ 0x20);

                // Copy all the previous permutations of this value but change the casing of the i-th character.
                // The span is captured before the loop runs, so entries appended below are not re-enumerated.
                foreach (string previous in newValues.AsSpan(start, index - start))
                {
                    newValues[index++] = $"{previous.AsSpan(0, i)}{flipped}{previous.AsSpan(i + 1)}";
                }
            }
        }
    }

    Debug.Assert(index == newValues.Length);
    return true;
}

private static SearchValues<string> CreateForSingleValue(
string value,
HashSet<string>? uniqueValues,
Expand Down