Faster optimized frozen dictionary creation (1/n) (#87510)

* Every strategy needs an array of keys, so we can create it up-front and iterate over it, rather than over the dictionary, to get the min and max lengths (1-2% gain).
* Instead of ensuring that at least 95% of the data is good, we stop as soon as we know that at least 5% is bad (13-14% gain).
* Toggle the direction and re-use the comparer and hashset (3% time gain, 12% allocations reduction).
dotnet · Jun 14, 2023 · 9b6bab4 · 9b6bab4
1 parent e082cc6
commit 9b6bab4
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 55 deletions.
diff --git a/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenDictionary.cs b/src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenDictionary.cs
@@ -232,24 +232,26 @@ private static FrozenDictionary<TKey, TValue> ChooseImplementationOptimizedForRe
                 Dictionary<string, TValue> stringEntries = (Dictionary<string, TValue>)(object)source;
                 IEqualityComparer<string> stringComparer = (IEqualityComparer<string>)(object)comparer;
 
+                // this array is needed for every strategy
+                string[] entries = (string[])(object)source.Keys.ToArray();
+
                 // Calculate the minimum and maximum lengths of the strings in the dictionary. Several of the analyses need this.
                 int minLength = int.MaxValue, maxLength = 0;
-                foreach (KeyValuePair<string, TValue> kvp in stringEntries)
+                foreach (string key in entries)
                 {
-                    if (kvp.Key.Length < minLength) minLength = kvp.Key.Length;
-                    if (kvp.Key.Length > maxLength) maxLength = kvp.Key.Length;
+                    if (key.Length < minLength) minLength = key.Length;
+                    if (key.Length > maxLength) maxLength = key.Length;
                 }
                 Debug.Assert(minLength >= 0 && maxLength >= minLength);
 
                 // Try to create an implementation that uses length buckets, where each bucket contains up to only a few strings of the same length.
-                FrozenDictionary<string, TValue>? frozenDictionary = LengthBucketsFrozenDictionary<TValue>.CreateLengthBucketsFrozenDictionaryIfAppropriate(stringEntries, stringComparer, minLength, maxLength);
+                FrozenDictionary<string, TValue>? frozenDictionary = LengthBucketsFrozenDictionary<TValue>.CreateLengthBucketsFrozenDictionaryIfAppropriate(stringEntries, stringComparer, minLength, maxLength, entries);
                 if (frozenDictionary is not null)
                 {
                     return (FrozenDictionary<TKey, TValue>)(object)frozenDictionary;
                 }
 
                 // Analyze the keys for unique substrings and create an implementation that minimizes the cost of hashing keys.
-                string[] entries = (string[])(object)source.Keys.ToArray();
                 KeyAnalyzer.AnalysisResults analysis = KeyAnalyzer.Analyze(entries, ReferenceEquals(stringComparer, StringComparer.OrdinalIgnoreCase), minLength, maxLength);
                 if (analysis.SubstringHashing)
                 {

diff --git a/...ibraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs b/...ibraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs
@@ -46,32 +46,29 @@ public static AnalysisResults Analyze(
         /// <summary>Try to find the minimal unique substring index/length to use for comparisons.</summary>
         private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, out AnalysisResults results)
         {
-            const double SufficientUniquenessFactor = 0.95; // 95% is good enough
             const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit... t's not worth the increase in algorithmic complexity to analyze longer substrings
 
-            SubstringComparer leftComparer = ignoreCase ? new LeftJustifiedCaseInsensitiveSubstringComparer() : new LeftJustifiedSubstringComparer();
-            HashSet<string> leftSet = new HashSet<string>(
+            SubstringComparer comparer = ignoreCase ? new JustifiedCaseInsensitiveSubstringComparer() : new JustifiedSubstringComparer();
+            HashSet<string> set = new HashSet<string>(
 #if NET6_0_OR_GREATER
                 uniqueStrings.Length,
 #endif
-                leftComparer);
-
-            HashSet<string>? rightSet = null;
-            SubstringComparer? rightComparer = null;
+                comparer);
 
             // For each substring length...
             int maxSubstringLength = Math.Min(minLength, MaxSubstringLengthLimit);
             for (int count = 1; count <= maxSubstringLength; count++)
             {
-                leftComparer.Count = count;
+                comparer.IsLeft = true;
+                comparer.Count = count;
 
                 // For each index, get a uniqueness factor for the left-justified substrings.
                 // If any is above our threshold, we're done.
                 for (int index = 0; index <= minLength - count; index++)
                 {
-                    leftComparer.Index = index;
-                    double factor = GetUniquenessFactor(leftSet, uniqueStrings);
-                    if (factor >= SufficientUniquenessFactor)
+                    comparer.Index = index;
+
+                    if (HasSufficientUniquenessFactor(set, uniqueStrings))
                     {
                         results = CreateAnalysisResults(
                             uniqueStrings, ignoreCase, minLength, maxLength, index, count,
@@ -86,31 +83,20 @@ private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ign
                 // right-justified substrings, and so we also check right-justification.
                 if (minLength != maxLength)
                 {
-                    // Lazily-initialize the right-comparer/set state, as it's often not needed.
-                    if (rightComparer is null)
-                    {
-                        rightComparer = ignoreCase ? new RightJustifiedCaseInsensitiveSubstringComparer() : new RightJustifiedSubstringComparer();
-                        rightSet = new HashSet<string>(
-#if NET6_0_OR_GREATER
-                            uniqueStrings.Length,
-#endif
-                            rightComparer);
-                    }
-                    rightComparer.Count = count;
-                    Debug.Assert(rightSet is not null);
+                    // toggle the direction and re-use the comparer and hashset (HasSufficientUniquenessFactor clears it)
+                    comparer.IsLeft = false;
 
                     // For each index, get a uniqueness factor for the right-justified substrings.
                     // If any is above our threshold, we're done.
                     for (int index = 0; index <= minLength - count; index++)
                     {
                         // Get a uniqueness factor for the right-justified substrings.
                         // If it's above our threshold, we're done.
-                        rightComparer.Index = -index - count;
-                        double factor = GetUniquenessFactor(rightSet, uniqueStrings);
-                        if (factor >= SufficientUniquenessFactor)
+                        comparer.Index = -index - count;
+                        if (HasSufficientUniquenessFactor(set, uniqueStrings))
                         {
                             results = CreateAnalysisResults(
-                                uniqueStrings, ignoreCase, minLength, maxLength, rightComparer.Index, count,
+                                uniqueStrings, ignoreCase, minLength, maxLength, comparer.Index, count,
                                 static (string s, int index, int count) => s.AsSpan(s.Length + index, count));
                             return true;
                         }
@@ -235,15 +221,23 @@ private static bool ContainsAnyLetters(ReadOnlySpan<char> s)
 #endif
         }
 
-        private static double GetUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings)
+        private static bool HasSufficientUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings)
         {
             set.Clear();
+
+            // SufficientUniquenessFactor of 95% is good enough.
+            // Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad.
+            int acceptableNonUniqueCount = uniqueStrings.Length / 20;
+
             foreach (string s in uniqueStrings)
             {
-                set.Add(s);
+                if (!set.Add(s) && --acceptableNonUniqueCount < 0)
+                {
+                    return false;
+                }
             }
 
-            return set.Count / (double)uniqueStrings.Length;
+            return true;
         }
 
         internal readonly struct AnalysisResults
@@ -273,32 +267,21 @@ private abstract class SubstringComparer : IEqualityComparer<string>
         {
             public int Index;
             public int Count;
+            public bool IsLeft;
             public abstract bool Equals(string? x, string? y);
             public abstract int GetHashCode(string s);
         }
 
-        private sealed class LeftJustifiedSubstringComparer : SubstringComparer
-        {
-            public override bool Equals(string? x, string? y) => x.AsSpan(Index, Count).SequenceEqual(y.AsSpan(Index, Count));
-            public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(Index, Count));
-        }
-
-        private sealed class LeftJustifiedCaseInsensitiveSubstringComparer : SubstringComparer
-        {
-            public override bool Equals(string? x, string? y) => x.AsSpan(Index, Count).Equals(y.AsSpan(Index, Count), StringComparison.OrdinalIgnoreCase);
-            public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(Index, Count));
-        }
-
-        private sealed class RightJustifiedSubstringComparer : SubstringComparer
+        private sealed class JustifiedSubstringComparer : SubstringComparer
         {
-            public override bool Equals(string? x, string? y) => x.AsSpan(x!.Length + Index, Count).SequenceEqual(y.AsSpan(y!.Length + Index, Count));
-            public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(s.Length + Index, Count));
+            public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).SequenceEqual(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count));
+            public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count));
         }
 
-        private sealed class RightJustifiedCaseInsensitiveSubstringComparer : SubstringComparer
+        private sealed class JustifiedCaseInsensitiveSubstringComparer : SubstringComparer
         {
-            public override bool Equals(string? x, string? y) => x.AsSpan(x!.Length + Index, Count).Equals(y.AsSpan(y!.Length + Index, Count), StringComparison.OrdinalIgnoreCase);
-            public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(s.Length + Index, Count));
+            public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).Equals(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count), StringComparison.OrdinalIgnoreCase);
+            public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count));
         }
     }
 }
diff --git a/...llections.Immutable/src/System/Collections/Frozen/String/LengthBucketsFrozenDictionary.cs b/...llections.Immutable/src/System/Collections/Frozen/String/LengthBucketsFrozenDictionary.cs
@@ -36,7 +36,7 @@ private LengthBucketsFrozenDictionary(
         }
 
         internal static LengthBucketsFrozenDictionary<TValue>? CreateLengthBucketsFrozenDictionaryIfAppropriate(
-            Dictionary<string, TValue> source, IEqualityComparer<string> comparer, int minLength, int maxLength)
+            Dictionary<string, TValue> source, IEqualityComparer<string> comparer, int minLength, int maxLength, string[] keys)
         {
             Debug.Assert(source.Count != 0);
             Debug.Assert(comparer == EqualityComparer<string>.Default || comparer == StringComparer.Ordinal || comparer == StringComparer.OrdinalIgnoreCase);
@@ -81,7 +81,6 @@ private LengthBucketsFrozenDictionary(
                 return null;
             }
 
-            var keys = new string[source.Count];
             var values = new TValue[keys.Length];
             var lengthBuckets = new KeyValuePair<string, int>[spread][];