Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Several improvements / simplifications in Regex #100315

Merged
merged 3 commits into from
Apr 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -391,11 +391,13 @@ private static void AddIsECMABoundaryHelper(Dictionary<string, string[]> require
/// <summary>Adds a SearchValues instance declaration to the required helpers collection if the chars are ASCII.</summary>
private static string EmitSearchValuesOrLiteral(ReadOnlySpan<char> chars, Dictionary<string, string[]> requiredHelpers)
{
// SearchValues<char> is faster than a regular IndexOfAny("abcd") for sets of 4/5 values iff they are ASCII.
// Only emit SearchValues instances when we know they'll be faster to avoid increasing the startup cost too much.
Debug.Assert(chars.Length is 4 or 5);
Debug.Assert(chars.Length > 3);

return RegexCharClass.IsAscii(chars)
// IndexOfAny(SearchValues) is faster than a regular IndexOfAny("abcd") if:
// - There are more than 5 characters in the needle, or
// - There are only 4 or 5 characters in the needle and they're all ASCII.

return chars.Length > 5 || RegexCharClass.IsAscii(chars)
? EmitSearchValues(chars.ToArray(), requiredHelpers)
: Literal(chars.ToString());
}
Expand Down Expand Up @@ -3510,11 +3512,10 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL
{
if (iterationCount is null &&
node.Kind is RegexNodeKind.Notonelazy &&
subsequent?.FindStartingLiteral(4) is RegexNode.StartingLiteralData literal && // 5 == max efficiently optimized by IndexOfAny, and we need to reserve 1 for node.Ch
subsequent?.FindStartingLiteral() is RegexNode.StartingLiteralData literal &&
!literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method
(literal.String is not null ||
literal.SetChars is not null ||
(literal.AsciiChars is not null && node.Ch < 128) || // for ASCII sets, only allow when the target can be efficiently included in the set
literal.Range.LowInclusive == literal.Range.HighInclusive ||
(literal.Range.LowInclusive <= node.Ch && node.Ch <= literal.Range.HighInclusive))) // for ranges, only allow when the range overlaps with the target, since there's no accelerated way to search for the union
{
Expand Down Expand Up @@ -3546,18 +3547,6 @@ literal.SetChars is not null ||
(false, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({EmitSearchValuesOrLiteral($"{node.Ch}{literal.SetChars}".AsSpan(), requiredHelpers)});",
});
}
else if (literal.AsciiChars is not null) // set of only ASCII characters
{
char[] asciiChars = literal.AsciiChars;
overlap = asciiChars.Contains(node.Ch);
if (!overlap)
{
Debug.Assert(node.Ch < 128);
Array.Resize(ref asciiChars, asciiChars.Length + 1);
asciiChars[asciiChars.Length - 1] = node.Ch;
}
writer.WriteLine($"{startingPos} = {sliceSpan}.IndexOfAny({EmitSearchValues(asciiChars, requiredHelpers)});");
}
else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char from a RegexNode.One
{
overlap = literal.Range.LowInclusive == node.Ch;
Expand Down Expand Up @@ -4928,11 +4917,10 @@ private static bool TryEmitIndexOf(
{
bool negated = RegexCharClass.IsNegated(node.Str) ^ negate;

Span<char> setChars = stackalloc char[5]; // current max that's vectorized
int setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars);

// Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range.
if (setCharsCount is not (1 or 2) && RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive))
// IndexOfAny{Except}InRange
// Prefer IndexOfAnyInRange over IndexOfAny, except for tiny ranges (1 or 2 items) that IndexOfAny handles more efficiently
if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive) &&
(highInclusive - lowInclusive) > 1)
{
string indexOfAnyInRangeName = !negated ?
"IndexOfAnyInRange" :
Expand All @@ -4944,13 +4932,15 @@ private static bool TryEmitIndexOf(
return true;
}

if (setCharsCount > 0)
// IndexOfAny{Except}(ch1, ...)
Span<char> setChars = stackalloc char[128];
setChars = setChars.Slice(0, RegexCharClass.GetSetChars(node.Str, setChars));
if (!setChars.IsEmpty)
{
(string indexOfName, string indexOfAnyName) = !negated ?
("IndexOf", "IndexOfAny") :
("IndexOfAnyExcept", "IndexOfAnyExcept");

setChars = setChars.Slice(0, setCharsCount);
indexOfExpr = setChars.Length switch
{
1 => $"{last}{indexOfName}({Literal(setChars[0])})",
Expand All @@ -4962,18 +4952,6 @@ private static bool TryEmitIndexOf(
literalLength = 1;
return true;
}

if (RegexCharClass.TryGetAsciiSetChars(node.Str, out char[]? asciiChars))
{
string indexOfAnyName = !negated ?
"IndexOfAny" :
"IndexOfAnyExcept";

indexOfExpr = $"{last}{indexOfAnyName}({EmitSearchValues(asciiChars, requiredHelpers)})";

literalLength = 1;
return true;
}
}

indexOfExpr = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
<EnableDefaultEmbeddedResourceItems>false</EnableDefaultEmbeddedResourceItems>
<UsingToolXliff>true</UsingToolXliff>
<CLSCompliant>false</CLSCompliant>
<NoWarn>$(NoWarn);CS0436;CS0649</NoWarn>
<NoWarn>$(NoWarn);CS0436;CS0649;CA1872</NoWarn>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
<AnalyzerLanguage>cs</AnalyzerLanguage>
</PropertyGroup>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -815,17 +815,23 @@ public static bool TryGetDoubleRange(
/// If 0 is returned, no assumptions can be made about the characters.
/// </returns>
/// <remarks>
/// Only considers character classes that only contain sets (no categories)
/// and no subtraction... just simple sets containing starting/ending pairs.
/// The returned characters may be negated: if IsNegated(set) is false, then
/// the returned characters are the only ones that match; if it returns true,
/// then the returned characters are the only ones that don't match.
/// Only considers character classes that only contain sets (no categories),
/// just simple sets containing starting/ending pairs (subtraction from those pairs
/// is factored in, however).The returned characters may be negated: if IsNegated(set)
/// is false, then the returned characters are the only ones that match; if it returns
/// true, then the returned characters are the only ones that don't match.
/// </remarks>
public static int GetSetChars(string set, Span<char> chars)
{
// We get the characters by enumerating the set portion, so we validate that it's
// set up to enable that, e.g. no categories.
if (!CanEasilyEnumerateSetContents(set))
if (!CanEasilyEnumerateSetContents(set, out bool hasSubtraction))
{
return 0;
}

// Negation with subtraction is too cumbersome to reason about efficiently.
if (hasSubtraction && IsNegated(set))
{
return 0;
}
Expand All @@ -837,40 +843,37 @@ public static int GetSetChars(string set, Span<char> chars)
// based on it a) complicating things, and b) it being really unlikely to
// be part of a small set.
int setLength = set[SetLengthIndex];
int count = 0;
int count = 0, evaluated = 0;
for (int i = SetStartIndex; i < SetStartIndex + setLength; i += 2)
{
int curSetEnd = set[i + 1];
for (int c = set[i]; c < curSetEnd; c++)
{
if (count >= chars.Length)
// Keep track of how many characters we've checked. This could work
// just comparing count rather than evaluated, but we also want to
// limit how much work is done here, which we can do by constraining
// the number of checks to the size of the storage provided.
if (++evaluated > chars.Length)
{
return 0;
}

// If the set is all ranges but has a subtracted class,
// validate the char is actually in the set prior to storing it:
// it might be in the subtracted range.
if (hasSubtraction && !CharInClass((char)c, set))
{
continue;
}

Debug.Assert(count <= evaluated);
chars[count++] = (char)c;
}
}

return count;
}

public static bool TryGetAsciiSetChars(string set, [NotNullWhen(true)] out char[]? asciiChars)
{
Span<char> chars = stackalloc char[128];

chars = chars.Slice(0, GetSetChars(set, chars));

if (chars.IsEmpty || !IsAscii(chars))
{
asciiChars = null;
return false;
}

asciiChars = chars.ToArray();
return true;
}

/// <summary>
/// Determines whether two sets may overlap.
/// </summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -939,7 +939,7 @@ void EmitFixedSet_LeftToRight()
default:
// tmp = ...IndexOfAny(setChars);
// tmp = ...IndexOfAny(s_searchValues);
EmitIndexOfAnyWithSearchValuesOrLiteral(new string(primarySet.Chars), except: primarySet.Negated);
EmitIndexOfAnyWithSearchValuesOrLiteral(primarySet.Chars, except: primarySet.Negated);
break;
}
}
Expand Down Expand Up @@ -3587,11 +3587,10 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL
if (!rtl &&
iterationCount is null &&
node.Kind is RegexNodeKind.Notonelazy &&
subsequent?.FindStartingLiteral(4) is RegexNode.StartingLiteralData literal && // 5 == max optimized by IndexOfAny, and we need to reserve 1 for node.Ch
subsequent?.FindStartingLiteral() is RegexNode.StartingLiteralData literal &&
!literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method
(literal.String is not null ||
literal.SetChars is not null ||
(literal.AsciiChars is not null && node.Ch < 128) || // for ASCII sets, only allow when the target can be efficiently included in the set
literal.Range.LowInclusive == literal.Range.HighInclusive ||
(literal.Range.LowInclusive <= node.Ch && node.Ch <= literal.Range.HighInclusive))) // for ranges, only allow when the range overlaps with the target, since there's no accelerated way to search for the union
{
Expand Down Expand Up @@ -3660,18 +3659,6 @@ literal.SetChars is not null ||
break;
}
}
else if (literal.AsciiChars is not null) // set of only ASCII characters
{
char[] asciiChars = literal.AsciiChars;
overlap = asciiChars.AsSpan().Contains(node.Ch);
if (!overlap)
{
Debug.Assert(node.Ch < 128);
Array.Resize(ref asciiChars, asciiChars.Length + 1);
asciiChars[^1] = node.Ch;
}
EmitIndexOfAnyWithSearchValuesOrLiteral(new string(asciiChars));
}
else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char from a RegexNode.One
{
overlap = literal.Range.LowInclusive == node.Ch;
Expand Down Expand Up @@ -5153,21 +5140,9 @@ bool CanEmitIndexOf(RegexNode node, out int literalLength)

if (node.IsSetFamily)
{
Span<char> setChars = stackalloc char[5]; // current max that's vectorized
int setCharsCount;
if ((setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars)) > 0)
{
literalLength = 1;
return true;
}

if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive))
{
literalLength = 1;
return true;
}

if (RegexCharClass.TryGetAsciiSetChars(node.Str, out _))
Span<char> setChars = stackalloc char[128];
if (RegexCharClass.TryGetSingleRange(node.Str, out _, out _) ||
RegexCharClass.GetSetChars(node.Str, setChars) > 0)
{
literalLength = 1;
return true;
Expand Down Expand Up @@ -5218,26 +5193,11 @@ void EmitIndexOf(RegexNode node, bool useLast, bool negate)
{
bool negated = RegexCharClass.IsNegated(node.Str) ^ negate;

Span<char> setChars = stackalloc char[5]; // current max that's vectorized
int setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars);

// IndexOfAny{Except}InRange
// Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range.
if (setCharsCount is not (1 or 2) && RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive))
// Prefer IndexOfAnyInRange over IndexOfAny, except for tiny ranges (1 or 2 items) that IndexOfAny handles more efficiently
if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive) &&
(highInclusive - lowInclusive) > 1)
{
if (lowInclusive == highInclusive)
{
Ldc(lowInclusive);
Call((useLast, negated) switch
{
(false, false) => s_spanIndexOfChar,
(false, true) => s_spanIndexOfAnyExceptChar,
(true, false) => s_spanLastIndexOfChar,
(true, true) => s_spanLastIndexOfAnyExceptChar,
});
return;
}

Ldc(lowInclusive);
Ldc(highInclusive);
Call((useLast, negated) switch
Expand All @@ -5251,6 +5211,8 @@ void EmitIndexOf(RegexNode node, bool useLast, bool negate)
}

// IndexOfAny{Except}(ch1, ...)
Span<char> setChars = stackalloc char[128]; // arbitrary cut-off that accomodates all of ASCII and doesn't take too long to compute
int setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars);
if (setCharsCount > 0)
{
setChars = setChars.Slice(0, setCharsCount);
Expand Down Expand Up @@ -5293,17 +5255,10 @@ void EmitIndexOf(RegexNode node, bool useLast, bool negate)
return;

default:
EmitIndexOfAnyWithSearchValuesOrLiteral(setChars.ToString(), last: useLast, except: negated);
EmitIndexOfAnyWithSearchValuesOrLiteral(setChars, last: useLast, except: negated);
return;
}
}

// IndexOfAny{Except}(SearchValues<char>)
if (RegexCharClass.TryGetAsciiSetChars(node.Str, out char[]? asciiChars))
{
EmitIndexOfAnyWithSearchValuesOrLiteral(new string(asciiChars), last: useLast, except: negated);
return;
}
}

Debug.Fail("We should never get here. This method should only be called if CanEmitIndexOf returned true, and all of the same cases should be covered.");
Expand Down Expand Up @@ -6197,15 +6152,15 @@ private void EmitTimeoutCheckIfNeeded()
}

/// <summary>Emits a call to either IndexOfAny("abcd") or IndexOfAny(SearchValues) depending on the <paramref name="chars"/>.</summary>
private void EmitIndexOfAnyWithSearchValuesOrLiteral(string chars, bool last = false, bool except = false)
private void EmitIndexOfAnyWithSearchValuesOrLiteral(ReadOnlySpan<char> chars, bool last = false, bool except = false)
{
Debug.Assert(chars.Length > 3);
Debug.Assert(chars.Length > 3, $"chars.Length == {chars.Length}");

// SearchValues<char> is faster than a regular IndexOfAny("abcd") for sets of 4/5 values iff they are ASCII.
// Only emit SearchValues instances when we know they'll be faster to avoid increasing the startup cost too much.
if (chars.Length is 4 or 5 && !RegexCharClass.IsAscii(chars))
{
Ldstr(chars);
Ldstr(chars.ToString());
Call(s_stringAsSpanMethod);
Call((last, except) switch
{
Expand All @@ -6217,7 +6172,7 @@ private void EmitIndexOfAnyWithSearchValuesOrLiteral(string chars, bool last = f
}
else
{
LoadSearchValues(chars.ToCharArray());
LoadSearchValues(chars.ToArray());
Call((last, except) switch
{
(false, false) => s_spanIndexOfAnySearchValues,
Expand Down
Loading