Skip to content

Commit

Permalink
Faster optimized frozen dictionary creation (1/n) (#87510)
Browse files Browse the repository at this point in the history
* every strategy needs an array of keys, we can create it up-front and iterate over it rather than the dictionary to get min and max lengths (1-2% gain)

* Instead of ensuring that at least 95% of data is good, we stop when we know that at least 5% is bad (13-14% gain)

* toggle the direction and re-use the comparer and hashset (3% time gain, 12% allocations reduction)
  • Loading branch information
adamsitnik authored Jun 14, 2023
1 parent e082cc6 commit 9b6bab4
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 55 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -232,24 +232,26 @@ private static FrozenDictionary<TKey, TValue> ChooseImplementationOptimizedForRe
Dictionary<string, TValue> stringEntries = (Dictionary<string, TValue>)(object)source;
IEqualityComparer<string> stringComparer = (IEqualityComparer<string>)(object)comparer;

// this array is needed for every strategy
string[] entries = (string[])(object)source.Keys.ToArray();

// Calculate the minimum and maximum lengths of the strings in the dictionary. Several of the analyses need this.
int minLength = int.MaxValue, maxLength = 0;
foreach (KeyValuePair<string, TValue> kvp in stringEntries)
foreach (string key in entries)
{
if (kvp.Key.Length < minLength) minLength = kvp.Key.Length;
if (kvp.Key.Length > maxLength) maxLength = kvp.Key.Length;
if (key.Length < minLength) minLength = key.Length;
if (key.Length > maxLength) maxLength = key.Length;
}
Debug.Assert(minLength >= 0 && maxLength >= minLength);

// Try to create an implementation that uses length buckets, where each bucket contains up to only a few strings of the same length.
FrozenDictionary<string, TValue>? frozenDictionary = LengthBucketsFrozenDictionary<TValue>.CreateLengthBucketsFrozenDictionaryIfAppropriate(stringEntries, stringComparer, minLength, maxLength);
FrozenDictionary<string, TValue>? frozenDictionary = LengthBucketsFrozenDictionary<TValue>.CreateLengthBucketsFrozenDictionaryIfAppropriate(stringEntries, stringComparer, minLength, maxLength, entries);
if (frozenDictionary is not null)
{
return (FrozenDictionary<TKey, TValue>)(object)frozenDictionary;
}

// Analyze the keys for unique substrings and create an implementation that minimizes the cost of hashing keys.
string[] entries = (string[])(object)source.Keys.ToArray();
KeyAnalyzer.AnalysisResults analysis = KeyAnalyzer.Analyze(entries, ReferenceEquals(stringComparer, StringComparer.OrdinalIgnoreCase), minLength, maxLength);
if (analysis.SubstringHashing)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,32 +46,29 @@ public static AnalysisResults Analyze(
/// <summary>Try to find the minimal unique substring index/length to use for comparisons.</summary>
private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, out AnalysisResults results)
{
const double SufficientUniquenessFactor = 0.95; // 95% is good enough
const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit... t's not worth the increase in algorithmic complexity to analyze longer substrings

SubstringComparer leftComparer = ignoreCase ? new LeftJustifiedCaseInsensitiveSubstringComparer() : new LeftJustifiedSubstringComparer();
HashSet<string> leftSet = new HashSet<string>(
SubstringComparer comparer = ignoreCase ? new JustifiedCaseInsensitiveSubstringComparer() : new JustifiedSubstringComparer();
HashSet<string> set = new HashSet<string>(
#if NET6_0_OR_GREATER
uniqueStrings.Length,
#endif
leftComparer);

HashSet<string>? rightSet = null;
SubstringComparer? rightComparer = null;
comparer);

// For each substring length...
int maxSubstringLength = Math.Min(minLength, MaxSubstringLengthLimit);
for (int count = 1; count <= maxSubstringLength; count++)
{
leftComparer.Count = count;
comparer.IsLeft = true;
comparer.Count = count;

// For each index, get a uniqueness factor for the left-justified substrings.
// If any is above our threshold, we're done.
for (int index = 0; index <= minLength - count; index++)
{
leftComparer.Index = index;
double factor = GetUniquenessFactor(leftSet, uniqueStrings);
if (factor >= SufficientUniquenessFactor)
comparer.Index = index;

if (HasSufficientUniquenessFactor(set, uniqueStrings))
{
results = CreateAnalysisResults(
uniqueStrings, ignoreCase, minLength, maxLength, index, count,
Expand All @@ -86,31 +83,20 @@ private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ign
// right-justified substrings, and so we also check right-justification.
if (minLength != maxLength)
{
// Lazily-initialize the right-comparer/set state, as it's often not needed.
if (rightComparer is null)
{
rightComparer = ignoreCase ? new RightJustifiedCaseInsensitiveSubstringComparer() : new RightJustifiedSubstringComparer();
rightSet = new HashSet<string>(
#if NET6_0_OR_GREATER
uniqueStrings.Length,
#endif
rightComparer);
}
rightComparer.Count = count;
Debug.Assert(rightSet is not null);
// toggle the direction and re-use the comparer and hashset (HasSufficientUniquenessFactor clears it)
comparer.IsLeft = false;

// For each index, get a uniqueness factor for the right-justified substrings.
// If any is above our threshold, we're done.
for (int index = 0; index <= minLength - count; index++)
{
// Get a uniqueness factor for the right-justified substrings.
// If it's above our threshold, we're done.
rightComparer.Index = -index - count;
double factor = GetUniquenessFactor(rightSet, uniqueStrings);
if (factor >= SufficientUniquenessFactor)
comparer.Index = -index - count;
if (HasSufficientUniquenessFactor(set, uniqueStrings))
{
results = CreateAnalysisResults(
uniqueStrings, ignoreCase, minLength, maxLength, rightComparer.Index, count,
uniqueStrings, ignoreCase, minLength, maxLength, comparer.Index, count,
static (string s, int index, int count) => s.AsSpan(s.Length + index, count));
return true;
}
Expand Down Expand Up @@ -235,15 +221,23 @@ private static bool ContainsAnyLetters(ReadOnlySpan<char> s)
#endif
}

private static double GetUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings)
private static bool HasSufficientUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings)
{
set.Clear();

// SufficientUniquenessFactor of 95% is good enough.
// Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad.
int acceptableNonUniqueCount = uniqueStrings.Length / 20;

foreach (string s in uniqueStrings)
{
set.Add(s);
if (!set.Add(s) && --acceptableNonUniqueCount < 0)
{
return false;
}
}

return set.Count / (double)uniqueStrings.Length;
return true;
}

internal readonly struct AnalysisResults
Expand Down Expand Up @@ -273,32 +267,21 @@ private abstract class SubstringComparer : IEqualityComparer<string>
{
public int Index;
public int Count;
public bool IsLeft;
public abstract bool Equals(string? x, string? y);
public abstract int GetHashCode(string s);
}

private sealed class LeftJustifiedSubstringComparer : SubstringComparer
{
public override bool Equals(string? x, string? y) => x.AsSpan(Index, Count).SequenceEqual(y.AsSpan(Index, Count));
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(Index, Count));
}

private sealed class LeftJustifiedCaseInsensitiveSubstringComparer : SubstringComparer
{
public override bool Equals(string? x, string? y) => x.AsSpan(Index, Count).Equals(y.AsSpan(Index, Count), StringComparison.OrdinalIgnoreCase);
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(Index, Count));
}

private sealed class RightJustifiedSubstringComparer : SubstringComparer
private sealed class JustifiedSubstringComparer : SubstringComparer
{
public override bool Equals(string? x, string? y) => x.AsSpan(x!.Length + Index, Count).SequenceEqual(y.AsSpan(y!.Length + Index, Count));
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(s.Length + Index, Count));
public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).SequenceEqual(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count));
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count));
}

private sealed class RightJustifiedCaseInsensitiveSubstringComparer : SubstringComparer
private sealed class JustifiedCaseInsensitiveSubstringComparer : SubstringComparer
{
public override bool Equals(string? x, string? y) => x.AsSpan(x!.Length + Index, Count).Equals(y.AsSpan(y!.Length + Index, Count), StringComparison.OrdinalIgnoreCase);
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(s.Length + Index, Count));
public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).Equals(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count), StringComparison.OrdinalIgnoreCase);
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ private LengthBucketsFrozenDictionary(
}

internal static LengthBucketsFrozenDictionary<TValue>? CreateLengthBucketsFrozenDictionaryIfAppropriate(
Dictionary<string, TValue> source, IEqualityComparer<string> comparer, int minLength, int maxLength)
Dictionary<string, TValue> source, IEqualityComparer<string> comparer, int minLength, int maxLength, string[] keys)
{
Debug.Assert(source.Count != 0);
Debug.Assert(comparer == EqualityComparer<string>.Default || comparer == StringComparer.Ordinal || comparer == StringComparer.OrdinalIgnoreCase);
Expand Down Expand Up @@ -81,7 +81,6 @@ private LengthBucketsFrozenDictionary(
return null;
}

var keys = new string[source.Count];
var values = new TValue[keys.Length];
var lengthBuckets = new KeyValuePair<string, int>[spread][];

Expand Down

0 comments on commit 9b6bab4

Please sign in to comment.