dotnet · stephentoub · Apr 3, 2024 · Mar 26, 2024 · Mar 29, 2024 · Apr 3, 2024
diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
@@ -391,11 +391,13 @@ private static void AddIsECMABoundaryHelper(Dictionary<string, string[]> require
         /// <summary>Adds a SearchValues instance declaration to the required helpers collection if the chars are ASCII.</summary>
         private static string EmitSearchValuesOrLiteral(ReadOnlySpan<char> chars, Dictionary<string, string[]> requiredHelpers)
         {
-            // SearchValues<char> is faster than a regular IndexOfAny("abcd") for sets of 4/5 values iff they are ASCII.
-            // Only emit SearchValues instances when we know they'll be faster to avoid increasing the startup cost too much.
-            Debug.Assert(chars.Length is 4 or 5);
+            Debug.Assert(chars.Length > 3);
 
-            return RegexCharClass.IsAscii(chars)
+            // IndexOfAny(SearchValues) is faster than a regular IndexOfAny("abcd") if:
+            // - There are more than 5 characters in the needle, or
+            // - There are only 4 or 5 characters in the needle and they're all ASCII.
+
+            return (chars.Length > 5 || (chars.Length > 3 && RegexCharClass.IsAscii(chars)))
                 ? EmitSearchValues(chars.ToArray(), requiredHelpers)
                 : Literal(chars.ToString());
         }
@@ -3510,11 +3512,10 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL
                 {
                     if (iterationCount is null &&
                         node.Kind is RegexNodeKind.Notonelazy &&
-                        subsequent?.FindStartingLiteral(4) is RegexNode.StartingLiteralData literal && // 5 == max efficiently optimized by IndexOfAny, and we need to reserve 1 for node.Ch
+                        subsequent?.FindStartingLiteral() is RegexNode.StartingLiteralData literal &&
                         !literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method
                         (literal.String is not null ||
                          literal.SetChars is not null ||
-                         (literal.AsciiChars is not null && node.Ch < 128) || // for ASCII sets, only allow when the target can be efficiently included in the set
                          literal.Range.LowInclusive == literal.Range.HighInclusive ||
                          (literal.Range.LowInclusive <= node.Ch && node.Ch <= literal.Range.HighInclusive))) // for ranges, only allow when the range overlaps with the target, since there's no accelerated way to search for the union
                     {
@@ -3546,18 +3547,6 @@ literal.SetChars is not null ||
                                 (false, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({EmitSearchValuesOrLiteral($"{node.Ch}{literal.SetChars}".AsSpan(), requiredHelpers)});",
                             });
                         }
-                        else if (literal.AsciiChars is not null) // set of only ASCII characters
-                        {
-                            char[] asciiChars = literal.AsciiChars;
-                            overlap = asciiChars.Contains(node.Ch);
-                            if (!overlap)
-                            {
-                                Debug.Assert(node.Ch < 128);
-                                Array.Resize(ref asciiChars, asciiChars.Length + 1);
-                                asciiChars[asciiChars.Length - 1] = node.Ch;
-                            }
-                            writer.WriteLine($"{startingPos} = {sliceSpan}.IndexOfAny({EmitSearchValues(asciiChars, requiredHelpers)});");
-                        }
                         else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char from a RegexNode.One
                         {
                             overlap = literal.Range.LowInclusive == node.Ch;
@@ -4928,11 +4917,10 @@ private static bool TryEmitIndexOf(
             {
                 bool negated = RegexCharClass.IsNegated(node.Str) ^ negate;
 
-                Span<char> setChars = stackalloc char[5]; // current max that's vectorized
-                int setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars);
-
-                // Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range.
-                if (setCharsCount is not (1 or 2) && RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive))
+                // IndexOfAny{Except}InRange
+                // Prefer IndexOfAnyInRange over IndexOfAny, except for tiny ranges (1 or 2 items) that IndexOfAny handles more efficiently
+                if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive) &&
+                    (highInclusive - lowInclusive) > 1)
                 {
                     string indexOfAnyInRangeName = !negated ?
                         "IndexOfAnyInRange" :
@@ -4944,13 +4932,15 @@ private static bool TryEmitIndexOf(
                     return true;
                 }
 
-                if (setCharsCount > 0)
+                // IndexOfAny{Except}(ch1, ...)
+                Span<char> setChars = stackalloc char[128];
+                setChars = setChars.Slice(0, RegexCharClass.GetSetChars(node.Str, setChars));
+                if (!setChars.IsEmpty)
                 {
                     (string indexOfName, string indexOfAnyName) = !negated ?
                         ("IndexOf", "IndexOfAny") :
                         ("IndexOfAnyExcept", "IndexOfAnyExcept");
 
-                    setChars = setChars.Slice(0, setCharsCount);
                     indexOfExpr = setChars.Length switch
                     {
                         1 => $"{last}{indexOfName}({Literal(setChars[0])})",
@@ -4962,18 +4952,6 @@ private static bool TryEmitIndexOf(
                     literalLength = 1;
                     return true;
                 }
-
-                if (RegexCharClass.TryGetAsciiSetChars(node.Str, out char[]? asciiChars))
-                {
-                    string indexOfAnyName = !negated ?
-                        "IndexOfAny" :
-                        "IndexOfAnyExcept";
-
-                    indexOfExpr = $"{last}{indexOfAnyName}({EmitSearchValues(asciiChars, requiredHelpers)})";
-
-                    literalLength = 1;
-                    return true;
-                }
             }
 
             indexOfExpr = null;

diff --git a/...raries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj b/...raries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj
@@ -11,7 +11,7 @@
     <EnableDefaultEmbeddedResourceItems>false</EnableDefaultEmbeddedResourceItems>
     <UsingToolXliff>true</UsingToolXliff>
     <CLSCompliant>false</CLSCompliant>
-    <NoWarn>$(NoWarn);CS0436;CS0649</NoWarn>
+    <NoWarn>$(NoWarn);CS0436;CS0649;CA1872</NoWarn>
     <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
     <AnalyzerLanguage>cs</AnalyzerLanguage>
   </PropertyGroup>

diff --git a/...aries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/...aries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
@@ -815,17 +815,23 @@ public static bool TryGetDoubleRange(
         /// If 0 is returned, no assumptions can be made about the characters.
         /// </returns>
         /// <remarks>
-        /// Only considers character classes that only contain sets (no categories)
-        /// and no subtraction... just simple sets containing starting/ending pairs.
-        /// The returned characters may be negated: if IsNegated(set) is false, then
-        /// the returned characters are the only ones that match; if it returns true,
-        /// then the returned characters are the only ones that don't match.
+        /// Only considers character classes that only contain sets (no categories),
+        /// just simple sets containing starting/ending pairs (subtraction from those pairs
+        /// is factored in, however).The returned characters may be negated: if IsNegated(set)
+        /// is false, then the returned characters are the only ones that match; if it returns
+        /// true, then the returned characters are the only ones that don't match.
         /// </remarks>
         public static int GetSetChars(string set, Span<char> chars)
         {
             // We get the characters by enumerating the set portion, so we validate that it's
             // set up to enable that, e.g. no categories.
-            if (!CanEasilyEnumerateSetContents(set))
+            if (!CanEasilyEnumerateSetContents(set, out bool hasSubtraction))
+            {
+                return 0;
+            }
+
+            // Negation with subtraction is too cumbersome to reason about efficiently.
+            if (hasSubtraction && IsNegated(set))
             {
                 return 0;
             }
@@ -837,40 +843,37 @@ public static int GetSetChars(string set, Span<char> chars)
             // based on it a) complicating things, and b) it being really unlikely to
             // be part of a small set.
             int setLength = set[SetLengthIndex];
-            int count = 0;
+            int count = 0, evaluated = 0;
             for (int i = SetStartIndex; i < SetStartIndex + setLength; i += 2)
             {
                 int curSetEnd = set[i + 1];
                 for (int c = set[i]; c < curSetEnd; c++)
                 {
-                    if (count >= chars.Length)
+                    // Keep track of how many characters we've checked. This could work
+                    // just comparing count rather than evaluated, but we also want to
+                    // limit how much work is done here, which we can do by constraining
+                    // the number of checks to the size of the storage provided.
+                    if (++evaluated > chars.Length)
                     {
                         return 0;
                     }
 
+                    // If the set is all ranges but has a subtracted class,
+                    // validate the char is actually in the set prior to storing it:
+                    // it might be in the subtracted range.
+                    if (hasSubtraction && !CharInClass((char)c, set))
+                    {
+                        continue;
+                    }
+
+                    Debug.Assert(count <= evaluated);
                     chars[count++] = (char)c;
                 }
             }
 
             return count;
         }
 
-        public static bool TryGetAsciiSetChars(string set, [NotNullWhen(true)] out char[]? asciiChars)
-        {
-            Span<char> chars = stackalloc char[128];
-
-            chars = chars.Slice(0, GetSetChars(set, chars));
-
-            if (chars.IsEmpty || !IsAscii(chars))
-            {
-                asciiChars = null;
-                return false;
-            }
-
-            asciiChars = chars.ToArray();
-            return true;
-        }
-
         /// <summary>
         /// Determines whether two sets may overlap.
         /// </summary>

diff --git a/...raries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/...raries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
@@ -939,7 +939,7 @@ void EmitFixedSet_LeftToRight()
                             default:
                                 // tmp = ...IndexOfAny(setChars);
                                 // tmp = ...IndexOfAny(s_searchValues);
-                                EmitIndexOfAnyWithSearchValuesOrLiteral(new string(primarySet.Chars), except: primarySet.Negated);
+                                EmitIndexOfAnyWithSearchValuesOrLiteral(primarySet.Chars, except: primarySet.Negated);
                                 break;
                         }
                     }
@@ -3587,11 +3587,10 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL
                 if (!rtl &&
                     iterationCount is null &&
                     node.Kind is RegexNodeKind.Notonelazy &&
-                    subsequent?.FindStartingLiteral(4) is RegexNode.StartingLiteralData literal && // 5 == max optimized by IndexOfAny, and we need to reserve 1 for node.Ch
+                    subsequent?.FindStartingLiteral() is RegexNode.StartingLiteralData literal &&
                     !literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method
                     (literal.String is not null ||
                      literal.SetChars is not null ||
-                     (literal.AsciiChars is not null && node.Ch < 128) || // for ASCII sets, only allow when the target can be efficiently included in the set
                      literal.Range.LowInclusive == literal.Range.HighInclusive ||
                      (literal.Range.LowInclusive <= node.Ch && node.Ch <= literal.Range.HighInclusive))) // for ranges, only allow when the range overlaps with the target, since there's no accelerated way to search for the union
                 {
@@ -3660,18 +3659,6 @@ literal.SetChars is not null ||
                                 break;
                         }
                     }
-                    else if (literal.AsciiChars is not null) // set of only ASCII characters
-                    {
-                        char[] asciiChars = literal.AsciiChars;
-                        overlap = asciiChars.AsSpan().Contains(node.Ch);
-                        if (!overlap)
-                        {
-                            Debug.Assert(node.Ch < 128);
-                            Array.Resize(ref asciiChars, asciiChars.Length + 1);
-                            asciiChars[^1] = node.Ch;
-                        }
-                        EmitIndexOfAnyWithSearchValuesOrLiteral(new string(asciiChars));
-                    }
                     else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char from a RegexNode.One
                     {
                         overlap = literal.Range.LowInclusive == node.Ch;
@@ -5153,21 +5140,9 @@ bool CanEmitIndexOf(RegexNode node, out int literalLength)
 
                 if (node.IsSetFamily)
                 {
-                    Span<char> setChars = stackalloc char[5]; // current max that's vectorized
-                    int setCharsCount;
-                    if ((setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars)) > 0)
-                    {
-                        literalLength = 1;
-                        return true;
-                    }
-
-                    if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive))
-                    {
-                        literalLength = 1;
-                        return true;
-                    }
-
-                    if (RegexCharClass.TryGetAsciiSetChars(node.Str, out _))
+                    Span<char> setChars = stackalloc char[128];
+                    if (RegexCharClass.TryGetSingleRange(node.Str, out _, out _) ||
+                        RegexCharClass.GetSetChars(node.Str, setChars) > 0)
                     {
                         literalLength = 1;
                         return true;
@@ -5218,26 +5193,11 @@ void EmitIndexOf(RegexNode node, bool useLast, bool negate)
                 {
                     bool negated = RegexCharClass.IsNegated(node.Str) ^ negate;
 
-                    Span<char> setChars = stackalloc char[5]; // current max that's vectorized
-                    int setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars);
-
                     // IndexOfAny{Except}InRange
-                    // Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range.
-                    if (setCharsCount is not (1 or 2) && RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive))
+                    // Prefer IndexOfAnyInRange over IndexOfAny, except for tiny ranges (1 or 2 items) that IndexOfAny handles more efficiently
+                    if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive) &&
+                        (highInclusive - lowInclusive) > 1)
                     {
-                        if (lowInclusive == highInclusive)
-                        {
-                            Ldc(lowInclusive);
-                            Call((useLast, negated) switch
-                            {
-                                (false, false) => s_spanIndexOfChar,
-                                (false, true) => s_spanIndexOfAnyExceptChar,
-                                (true, false) => s_spanLastIndexOfChar,
-                                (true, true) => s_spanLastIndexOfAnyExceptChar,
-                            });
-                            return;
-                        }
-
                         Ldc(lowInclusive);
                         Ldc(highInclusive);
                         Call((useLast, negated) switch
@@ -5251,6 +5211,8 @@ void EmitIndexOf(RegexNode node, bool useLast, bool negate)
                     }
 
                     // IndexOfAny{Except}(ch1, ...)
+                    Span<char> setChars = stackalloc char[128]; // arbitrary cut-off that accomodates all of ASCII and doesn't take too long to compute
+                    int setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars);
                     if (setCharsCount > 0)
                     {
                         setChars = setChars.Slice(0, setCharsCount);
@@ -5293,17 +5255,10 @@ void EmitIndexOf(RegexNode node, bool useLast, bool negate)
                                 return;
 
                             default:
-                                EmitIndexOfAnyWithSearchValuesOrLiteral(setChars.ToString(), last: useLast, except: negated);
+                                EmitIndexOfAnyWithSearchValuesOrLiteral(setChars, last: useLast, except: negated);
                                 return;
                         }
                     }
-
-                    // IndexOfAny{Except}(SearchValues<char>)
-                    if (RegexCharClass.TryGetAsciiSetChars(node.Str, out char[]? asciiChars))
-                    {
-                        EmitIndexOfAnyWithSearchValuesOrLiteral(new string(asciiChars), last: useLast, except: negated);
-                        return;
-                    }
                 }
 
                 Debug.Fail("We should never get here. This method should only be called if CanEmitIndexOf returned true, and all of the same cases should be covered.");
@@ -6197,15 +6152,15 @@ private void EmitTimeoutCheckIfNeeded()
         }
 
         /// <summary>Emits a call to either IndexOfAny("abcd") or IndexOfAny(SearchValues) depending on the <paramref name="chars"/>.</summary>
-        private void EmitIndexOfAnyWithSearchValuesOrLiteral(string chars, bool last = false, bool except = false)
+        private void EmitIndexOfAnyWithSearchValuesOrLiteral(ReadOnlySpan<char> chars, bool last = false, bool except = false)
         {
-            Debug.Assert(chars.Length > 3);
+            Debug.Assert(chars.Length > 3, $"chars.Length == {chars.Length}");
 
             // SearchValues<char> is faster than a regular IndexOfAny("abcd") for sets of 4/5 values iff they are ASCII.
             // Only emit SearchValues instances when we know they'll be faster to avoid increasing the startup cost too much.
             if (chars.Length is 4 or 5 && !RegexCharClass.IsAscii(chars))
             {
-                Ldstr(chars);
+                Ldstr(chars.ToString());
                 Call(s_stringAsSpanMethod);
                 Call((last, except) switch
                 {
@@ -6217,7 +6172,7 @@ private void EmitIndexOfAnyWithSearchValuesOrLiteral(string chars, bool last = f
             }
             else
             {
-                LoadSearchValues(chars.ToCharArray());
+                LoadSearchValues(chars.ToArray());
                 Call((last, except) switch
                 {
                     (false, false) => s_spanIndexOfAnySearchValues,