From 0d381fb7505893ab05a111a04a7e2a337d248879 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Sun, 25 Feb 2024 09:00:30 +0100 Subject: [PATCH 1/2] Expand small value sets to all case permutations in SearchValues --- .../Strings/StringSearchValues.cs | 79 ++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs index 86a13dd04b9b0..adc4ae9705b79 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs @@ -3,6 +3,7 @@ using System.Collections.Generic; using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; using System.Globalization; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; @@ -14,6 +15,8 @@ namespace System.Buffers { internal static class StringSearchValues { + private const int TeddyBucketCount = 8; + private static readonly SearchValues s_asciiLetters = SearchValues.Create("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); @@ -248,6 +251,18 @@ static SearchValues PickAhoCorasickImplementation(AhoC Debug.Assert(!(asciiStartLettersOnly && asciiStartUnaffectedByCaseConversion)); + // If we still have empty buckets we could use and we're ignoring case, we may be able to + // generate all possible permutations of the first N characters and switch to case-sensitive searching. + // E.g. ["ab", "c!"] => ["ab", "Ab" "aB", "AB", "c!", "C!"]. + // This won't apply to inputs with many letters (e.g. "abc" => 8 permutations on its own). + if (!asciiStartUnaffectedByCaseConversion && + values.Length < TeddyBucketCount && + TryGenerateAllCasePermutationsForPrefixes(values, n, TeddyBucketCount, out string[]? newValues)) + { + asciiStartUnaffectedByCaseConversion = true; + values = newValues; + } + if (asciiStartUnaffectedByCaseConversion) { return nonAsciiAffectedByCaseConversion @@ -280,7 +295,7 @@ private static SearchValues PickTeddyImplementation 8) { - string[][] buckets = TeddyBucketizer.Bucketize(values, bucketCount: 8, n); + string[][] buckets = TeddyBucketizer.Bucketize(values, TeddyBucketCount, n); // Potential optimization: We don't have to pick the first N characters for the fingerprint. // Different offset selection can noticeably improve throughput (e.g. 2x). @@ -297,6 +312,68 @@ private static SearchValues PickTeddyImplementation values, int n, int maxValues, [NotNullWhen(true)] out string[]? newValues) + { + Debug.Assert(n is 2 or 3); + Debug.Assert(values.Length < maxValues); + + // Count how many possible permutations there are. + int newValuesCount = 0; + + foreach (string value in values) + { + int permutations = 1; + + foreach (char c in value.AsSpan(0, n)) + { + Debug.Assert(char.IsAscii(c)); + + if (char.IsAsciiLetter(c)) + { + permutations *= 2; + } + } + + newValuesCount += permutations; + } + + Debug.Assert(newValuesCount > values.Length, "Shouldn't have been called if there were no letters present"); + + if (newValuesCount > maxValues) + { + newValues = null; + return false; + } + + // Generate the permutations. + newValues = new string[newValuesCount]; + newValuesCount = 0; + + foreach (string value in values) + { + int start = newValuesCount; + + newValues[newValuesCount++] = value; + + for (int i = 0; i < n; i++) + { + char c = value[i]; + + if (char.IsAsciiLetter(c)) + { + // Copy all the previous permutations of this value but change the casing of the i-th character. + foreach (string previous in newValues.AsSpan(start, newValuesCount - start)) + { + newValues[newValuesCount++] = $"{previous.AsSpan(0, i)}{(char)(c ^ 0x20)}{previous.AsSpan(i + 1)}"; + } + } + } + } + + Debug.Assert(newValuesCount == newValues.Length); + return true; + } + private static SearchValues CreateForSingleValue( string value, HashSet? uniqueValues, From ac5966c60adc32195edc03dd3051b0c006bd9d19 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Sun, 25 Feb 2024 09:16:58 +0100 Subject: [PATCH 2/2] Use the constant in one more place --- .../src/System/SearchValues/Strings/StringSearchValues.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs index adc4ae9705b79..2bff05214c518 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs @@ -293,7 +293,7 @@ private static SearchValues PickTeddyImplementation 1); Debug.Assert(n is 2 or 3); - if (values.Length > 8) + if (values.Length > TeddyBucketCount) { string[][] buckets = TeddyBucketizer.Bucketize(values, TeddyBucketCount, n);