diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenDictionary.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenDictionary.cs index 5369038f7e546..8582703ffdcb7 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenDictionary.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenDictionary.cs @@ -232,24 +232,26 @@ private static FrozenDictionary ChooseImplementationOptimizedForRe Dictionary stringEntries = (Dictionary)(object)source; IEqualityComparer stringComparer = (IEqualityComparer)(object)comparer; + // this array is needed for every strategy + string[] entries = (string[])(object)source.Keys.ToArray(); + // Calculate the minimum and maximum lengths of the strings in the dictionary. Several of the analyses need this. int minLength = int.MaxValue, maxLength = 0; - foreach (KeyValuePair kvp in stringEntries) + foreach (string key in entries) { - if (kvp.Key.Length < minLength) minLength = kvp.Key.Length; - if (kvp.Key.Length > maxLength) maxLength = kvp.Key.Length; + if (key.Length < minLength) minLength = key.Length; + if (key.Length > maxLength) maxLength = key.Length; } Debug.Assert(minLength >= 0 && maxLength >= minLength); // Try to create an implementation that uses length buckets, where each bucket contains up to only a few strings of the same length. - FrozenDictionary? frozenDictionary = LengthBucketsFrozenDictionary.CreateLengthBucketsFrozenDictionaryIfAppropriate(stringEntries, stringComparer, minLength, maxLength); + FrozenDictionary? frozenDictionary = LengthBucketsFrozenDictionary.CreateLengthBucketsFrozenDictionaryIfAppropriate(stringEntries, stringComparer, minLength, maxLength, entries); if (frozenDictionary is not null) { return (FrozenDictionary)(object)frozenDictionary; } // Analyze the keys for unique substrings and create an implementation that minimizes the cost of hashing keys. - string[] entries = (string[])(object)source.Keys.ToArray(); KeyAnalyzer.AnalysisResults analysis = KeyAnalyzer.Analyze(entries, ReferenceEquals(stringComparer, StringComparer.OrdinalIgnoreCase), minLength, maxLength); if (analysis.SubstringHashing) { diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs index ee80e6a08549a..9da8b3c61e573 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs @@ -46,32 +46,29 @@ public static AnalysisResults Analyze( /// Try to find the minimal unique substring index/length to use for comparisons. private static bool TryUseSubstring(ReadOnlySpan uniqueStrings, bool ignoreCase, int minLength, int maxLength, out AnalysisResults results) { - const double SufficientUniquenessFactor = 0.95; // 95% is good enough const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit... t's not worth the increase in algorithmic complexity to analyze longer substrings - SubstringComparer leftComparer = ignoreCase ? new LeftJustifiedCaseInsensitiveSubstringComparer() : new LeftJustifiedSubstringComparer(); - HashSet leftSet = new HashSet( + SubstringComparer comparer = ignoreCase ? new JustifiedCaseInsensitiveSubstringComparer() : new JustifiedSubstringComparer(); + HashSet set = new HashSet( #if NET6_0_OR_GREATER uniqueStrings.Length, #endif - leftComparer); - - HashSet? rightSet = null; - SubstringComparer? rightComparer = null; + comparer); // For each substring length... int maxSubstringLength = Math.Min(minLength, MaxSubstringLengthLimit); for (int count = 1; count <= maxSubstringLength; count++) { - leftComparer.Count = count; + comparer.IsLeft = true; + comparer.Count = count; // For each index, get a uniqueness factor for the left-justified substrings. // If any is above our threshold, we're done. for (int index = 0; index <= minLength - count; index++) { - leftComparer.Index = index; - double factor = GetUniquenessFactor(leftSet, uniqueStrings); - if (factor >= SufficientUniquenessFactor) + comparer.Index = index; + + if (HasSufficientUniquenessFactor(set, uniqueStrings)) { results = CreateAnalysisResults( uniqueStrings, ignoreCase, minLength, maxLength, index, count, @@ -86,18 +83,8 @@ private static bool TryUseSubstring(ReadOnlySpan uniqueStrings, bool ign // right-justified substrings, and so we also check right-justification. if (minLength != maxLength) { - // Lazily-initialize the right-comparer/set state, as it's often not needed. - if (rightComparer is null) - { - rightComparer = ignoreCase ? new RightJustifiedCaseInsensitiveSubstringComparer() : new RightJustifiedSubstringComparer(); - rightSet = new HashSet( -#if NET6_0_OR_GREATER - uniqueStrings.Length, -#endif - rightComparer); - } - rightComparer.Count = count; - Debug.Assert(rightSet is not null); + // toggle the direction and re-use the comparer and hashset (HasSufficientUniquenessFactor clears it) + comparer.IsLeft = false; // For each index, get a uniqueness factor for the right-justified substrings. // If any is above our threshold, we're done. @@ -105,12 +92,11 @@ private static bool TryUseSubstring(ReadOnlySpan uniqueStrings, bool ign { // Get a uniqueness factor for the right-justified substrings. // If it's above our threshold, we're done. - rightComparer.Index = -index - count; - double factor = GetUniquenessFactor(rightSet, uniqueStrings); - if (factor >= SufficientUniquenessFactor) + comparer.Index = -index - count; + if (HasSufficientUniquenessFactor(set, uniqueStrings)) { results = CreateAnalysisResults( - uniqueStrings, ignoreCase, minLength, maxLength, rightComparer.Index, count, + uniqueStrings, ignoreCase, minLength, maxLength, comparer.Index, count, static (string s, int index, int count) => s.AsSpan(s.Length + index, count)); return true; } @@ -235,15 +221,23 @@ private static bool ContainsAnyLetters(ReadOnlySpan s) #endif } - private static double GetUniquenessFactor(HashSet set, ReadOnlySpan uniqueStrings) + private static bool HasSufficientUniquenessFactor(HashSet set, ReadOnlySpan uniqueStrings) { set.Clear(); + + // SufficientUniquenessFactor of 95% is good enough. + // Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad. + int acceptableNonUniqueCount = uniqueStrings.Length / 20; + foreach (string s in uniqueStrings) { - set.Add(s); + if (!set.Add(s) && --acceptableNonUniqueCount < 0) + { + return false; + } } - return set.Count / (double)uniqueStrings.Length; + return true; } internal readonly struct AnalysisResults @@ -273,32 +267,21 @@ private abstract class SubstringComparer : IEqualityComparer { public int Index; public int Count; + public bool IsLeft; public abstract bool Equals(string? x, string? y); public abstract int GetHashCode(string s); } - private sealed class LeftJustifiedSubstringComparer : SubstringComparer - { - public override bool Equals(string? x, string? y) => x.AsSpan(Index, Count).SequenceEqual(y.AsSpan(Index, Count)); - public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(Index, Count)); - } - - private sealed class LeftJustifiedCaseInsensitiveSubstringComparer : SubstringComparer - { - public override bool Equals(string? x, string? y) => x.AsSpan(Index, Count).Equals(y.AsSpan(Index, Count), StringComparison.OrdinalIgnoreCase); - public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(Index, Count)); - } - - private sealed class RightJustifiedSubstringComparer : SubstringComparer + private sealed class JustifiedSubstringComparer : SubstringComparer { - public override bool Equals(string? x, string? y) => x.AsSpan(x!.Length + Index, Count).SequenceEqual(y.AsSpan(y!.Length + Index, Count)); - public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(s.Length + Index, Count)); + public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).SequenceEqual(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count)); + public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count)); } - private sealed class RightJustifiedCaseInsensitiveSubstringComparer : SubstringComparer + private sealed class JustifiedCaseInsensitiveSubstringComparer : SubstringComparer { - public override bool Equals(string? x, string? y) => x.AsSpan(x!.Length + Index, Count).Equals(y.AsSpan(y!.Length + Index, Count), StringComparison.OrdinalIgnoreCase); - public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(s.Length + Index, Count)); + public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).Equals(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count), StringComparison.OrdinalIgnoreCase); + public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count)); } } } diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/LengthBucketsFrozenDictionary.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/LengthBucketsFrozenDictionary.cs index c48b7aeac1c73..4a83ac05485b0 100644 --- a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/LengthBucketsFrozenDictionary.cs +++ b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/LengthBucketsFrozenDictionary.cs @@ -36,7 +36,7 @@ private LengthBucketsFrozenDictionary( } internal static LengthBucketsFrozenDictionary? CreateLengthBucketsFrozenDictionaryIfAppropriate( - Dictionary source, IEqualityComparer comparer, int minLength, int maxLength) + Dictionary source, IEqualityComparer comparer, int minLength, int maxLength, string[] keys) { Debug.Assert(source.Count != 0); Debug.Assert(comparer == EqualityComparer.Default || comparer == StringComparer.Ordinal || comparer == StringComparer.OrdinalIgnoreCase); @@ -81,7 +81,6 @@ private LengthBucketsFrozenDictionary( return null; } - var keys = new string[source.Count]; var values = new TValue[keys.Length]; var lengthBuckets = new KeyValuePair[spread][];