Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move the IsLeft/IsRight decision out of the loop and use computed substring set #88516

Closed
wants to merge 5 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@
using System.Buffers;
using System.Collections.Generic;
using System.Diagnostics;
#if !NET8_0_OR_GREATER
using System.Runtime.CompilerServices;
#endif

namespace System.Collections.Frozen
{
Expand All @@ -31,86 +29,72 @@ internal static class KeyAnalyzer
public static AnalysisResults Analyze(
ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength)
{
Debug.Assert(!uniqueStrings.IsEmpty);
Debug.Assert(uniqueStrings.Length > 0);

// Try to pick a substring comparer. If we can't find a good substring comparer, fallback to a full string comparer.
AnalysisResults results;
if (minLength == 0 || !TryUseSubstring(uniqueStrings, ignoreCase, minLength, maxLength, out results))
IDisposable marked this conversation as resolved.
Show resolved Hide resolved
if (minLength > 0)
{
results = CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, 0, 0, static (s, _, _) => s.AsSpan());
}

return results;
}
const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit...it's not worth the increase in algorithmic complexity to analyze longer substrings

/// <summary>Try to find the minimal unique substring index/length to use for comparisons.</summary>
private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, out AnalysisResults results)
{
const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit... t's not worth the increase in algorithmic complexity to analyze longer substrings
// Sufficient uniqueness factor of 95% is good enough.
IDisposable marked this conversation as resolved.
Show resolved Hide resolved
// Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad.
int acceptableNonUniqueCount = uniqueStrings.Length / 20;

SubstringComparer comparer = ignoreCase ? new JustifiedCaseInsensitiveSubstringComparer() : new JustifiedSubstringComparer();
HashSet<string> set = new HashSet<string>(
// Try to pick a substring comparer.
SubstringComparer comparer = ignoreCase ? new JustifiedCaseInsensitiveSubstringComparer() : new JustifiedSubstringComparer();
HashSet<string> set = new HashSet<string>(
#if NET6_0_OR_GREATER
uniqueStrings.Length,
uniqueStrings.Length,
#endif
comparer);

// For each substring length...
int maxSubstringLength = Math.Min(minLength, MaxSubstringLengthLimit);
for (int count = 1; count <= maxSubstringLength; count++)
{
comparer.IsLeft = true;
comparer.Count = count;
comparer);

// For each index, get a uniqueness factor for the left-justified substrings.
// If any is above our threshold, we're done.
for (int index = 0; index <= minLength - count; index++)
// For each substring length...preferring the shortest length that provides
// enough uniqueness
int maxSubstringLength = Math.Min(minLength, MaxSubstringLengthLimit);
for (int count = 1; count <= maxSubstringLength; count++)
{
comparer.Index = index;
comparer.Count = count;

if (HasSufficientUniquenessFactor(set, uniqueStrings))
// For each index, get a uniqueness factor for the left-justified substrings.
// If any is above our threshold, we're done.
for (int index = 0; index <= minLength - count; index++)
{
results = CreateAnalysisResults(
uniqueStrings, ignoreCase, minLength, maxLength, index, count,
static (string s, int index, int count) => s.AsSpan(index, count));
IDisposable marked this conversation as resolved.
Show resolved Hide resolved
return true;
}
}
comparer.Index = index;

// There were no left-justified substrings of this length available.
// If all of the strings are of the same length, then just checking left-justification is sufficient.
// But if any strings are of different lengths, then we'll get different alignments for left- vs
// right-justified substrings, and so we also check right-justification.
if (minLength != maxLength)
{
// toggle the direction and re-use the comparer and hashset (HasSufficientUniquenessFactor clears it)
comparer.IsLeft = false;
if (HasSufficientUniquenessFactor(set, uniqueStrings, acceptableNonUniqueCount))
{
return CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, index, count);
}
}

// For each index, get a uniqueness factor for the right-justified substrings.
// If any is above our threshold, we're done.
for (int index = 0; index <= minLength - count; index++)
// There were no left-justified substrings of this length available.
// If all of the strings are of the same length, then just checking left-justification is sufficient.
// But if any strings are of different lengths, then we'll get different alignments for left- vs
// right-justified substrings, and so we also check right-justification.
if (minLength != maxLength)
{
// Get a uniqueness factor for the right-justified substrings.
// If it's above our threshold, we're done.
comparer.Index = -index - count;
if (HasSufficientUniquenessFactor(set, uniqueStrings))
// when Index is negative, we're offsetting from the right, ensure we're at least
// far enough from the right that we have count characters available
comparer.Index = -count;
Copy link
Contributor Author

@IDisposable IDisposable Jul 12, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here's a real improvement. We do the math for setting a negative index (e.g. from the right side) starting with count characters from the right (as before). Then on line 81 we keep decrementing comparer.Index as we go on moving the "cursor" left.


// For each index, get a uniqueness factor for the right-justified substrings.
// If any is above our threshold, we're done.
for (int offset = 0; offset <= minLength - count; offset++, comparer.Index--)
{
results = CreateAnalysisResults(
uniqueStrings, ignoreCase, minLength, maxLength, comparer.Index, count,
static (string s, int index, int count) => s.AsSpan(s.Length + index, count));
IDisposable marked this conversation as resolved.
Show resolved Hide resolved
return true;
IDisposable marked this conversation as resolved.
Show resolved Hide resolved
if (HasSufficientUniquenessFactor(set, uniqueStrings, acceptableNonUniqueCount))
{
return CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, comparer.Index, count);
}
}
}
}
}

// Could not find a substring index/length that was good enough.
results = default;
return false;
// Could not find a substring index/length that was good enough, use the entire string.
return CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, 0, 0);
IDisposable marked this conversation as resolved.
Show resolved Hide resolved
}

private static AnalysisResults CreateAnalysisResults(
ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, int index, int count, GetSpan getSubstringSpan)
ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, int index, int count)
IDisposable marked this conversation as resolved.
Show resolved Hide resolved
{
// Start off by assuming all strings are ASCII
bool allAsciiIfIgnoreCase = true;
Expand All @@ -129,7 +113,7 @@ private static AnalysisResults CreateAnalysisResults(
foreach (string s in uniqueStrings)
{
// Get the span for the substring.
ReadOnlySpan<char> substring = getSubstringSpan(s, index, count);
ReadOnlySpan<char> substring = count == 0 ? s.AsSpan() : Slicer(s, index, count);
Copy link
Contributor Author

@IDisposable IDisposable Jul 27, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to re-slice the string again here. It would be awesome if we could have a HashSet<ReadOnlySpan<char> but that's not going to happen as those would be structs not objects.


// If the substring isn't ASCII, bail out to return the results.
if (!IsAllAscii(substring))
Expand Down Expand Up @@ -158,8 +142,6 @@ private static AnalysisResults CreateAnalysisResults(
return new AnalysisResults(ignoreCase, allAsciiIfIgnoreCase, index, count, minLength, maxLength);
}

private delegate ReadOnlySpan<char> GetSpan(string s, int index, int count);
IDisposable marked this conversation as resolved.
Show resolved Hide resolved

internal static unsafe bool IsAllAscii(ReadOnlySpan<char> s)
{
#if NET8_0_OR_GREATER
Expand Down Expand Up @@ -221,17 +203,13 @@ private static bool ContainsAnyLetters(ReadOnlySpan<char> s)
#endif
}

private static bool HasSufficientUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings)
private static bool HasSufficientUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings, int acceptableNonUniqueCount)
IDisposable marked this conversation as resolved.
Show resolved Hide resolved
{
set.Clear();

// Sufficient uniqueness factor of 95% is good enough.
// Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad.
IDisposable marked this conversation as resolved.
Show resolved Hide resolved
int acceptableNonUniqueCount = uniqueStrings.Length / 20;

foreach (string s in uniqueStrings)
{
if (!set.Add(s) && --acceptableNonUniqueCount < 0)
if (!set.Add(s) && acceptableNonUniqueCount-- <= 0)
{
return false;
}
Expand Down Expand Up @@ -263,25 +241,34 @@ public AnalysisResults(bool ignoreCase, bool allAsciiIfIgnoreCase, int hashIndex
public bool RightJustifiedSubstring => HashIndex < 0;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static ReadOnlySpan<char> Slicer(this string s, int index, int count) => s.AsSpan((index >= 0 ? index : s.Length + index), count);
Copy link
Contributor Author

@IDisposable IDisposable Jul 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This ternary is the sole remaining conditional jump in the hot loop, but there's no good way to avoid that simultaneous avoiding delegate overhead so make it as simple as possible. The number of jumps (old vs. new) is identical, but this is a tiny tiny block of JITtable goodness.

Copy link
Contributor Author

@IDisposable IDisposable Jul 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have tried running this as a jumpless method body, but it makes little difference and is harder to grok; passing fromRight = 0 for left, or fromRight = 1 for right which requires the SubstringComparers to carry the left/right multiplier with them (so more state...):

public static ReadOnlySpan<char> Slicer(this string s, byte fromRight, int index, int count) => s.AsSpan((s.Length * fromRight) + index), count);

Also, if we COULD swap the HashSet<string>'s comparer out for left vs. right the we could just have that knowledge embedded with a trait and thus fully jumpless, but that would require allocating two HashSet<string>s which might be an allocation regression nobody wants :(

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did something like that in PR #89689 which is a huge win... since we can't swap the comparer, I ended up creating both a left and right HashSet with backing comparer that "hard codes" the left/right logic. HUGE WINS, see that PR.


private abstract class SubstringComparer : IEqualityComparer<string>
{
public int Index;
public int Count;
public bool IsLeft;
public int Index; // offset from left side (if positive) or right side (if negative) of the string
IDisposable marked this conversation as resolved.
Show resolved Hide resolved
public int Count; // number of characters in the span

public abstract bool Equals(string? x, string? y);
public abstract int GetHashCode(string s);
}

private sealed class JustifiedSubstringComparer : SubstringComparer
{
public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).SequenceEqual(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count));
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public override bool Equals(string? x, string? y) => x!.Slicer(Index, Count).SequenceEqual(y!.Slicer(Index, Count));
Copy link
Contributor Author

@IDisposable IDisposable Jul 12, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Slicer does the left/right based on the sign of the Index value now, which should JIT down better.

I really wish that String.AsSpan understood the use of -1 start intrinsically as string.Length - 1... that would make the ternary unneeded.


[MethodImpl(MethodImplOptions.AggressiveInlining)]
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.Slicer(Index, Count));
}

private sealed class JustifiedCaseInsensitiveSubstringComparer : SubstringComparer
{
public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).Equals(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count), StringComparison.OrdinalIgnoreCase);
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public override bool Equals(string? x, string? y) => x!.Slicer(Index, Count).Equals(y!.Slicer(Index, Count), StringComparison.OrdinalIgnoreCase);

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.Slicer(Index, Count));
}
}
}