Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Match and isMatch ReadOnlySpan<char> overloads as well as Regex protected methods which expose ReadOnlySpan<char> to avoid allocations. #62509

Closed
wants to merge 6 commits into from
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,15 @@ private static ImmutableArray<Diagnostic> EmitRegexMethod(IndentedTextWriter wri
DescribeExpression(writer, rm.Code.Tree.Root.Child(0), " // ", rm.Code); // skip implicit root capture
writer.WriteLine();

writer.WriteLine($" protected override bool FindFirstChar()");
writer.WriteLine($" protected override void Scan(global::System.ReadOnlySpan<char> text)");
writer.WriteLine($" {{");
writer.Indent += 4;
EmitScan(writer, rm, id);
writer.Indent -= 4;
writer.WriteLine($" }}");
writer.WriteLine();

writer.WriteLine($" private bool FindFirstChar(global::System.ReadOnlySpan<char> inputSpan)");
writer.WriteLine($" {{");
writer.Indent += 4;
RequiredHelperFunctions requiredHelpers = EmitFindFirstChar(writer, rm, id);
Expand All @@ -233,7 +241,7 @@ private static ImmutableArray<Diagnostic> EmitRegexMethod(IndentedTextWriter wri
{
writer.WriteLine($" [global::System.Runtime.CompilerServices.SkipLocalsInit]");
}
writer.WriteLine($" protected override void Go()");
writer.WriteLine($" private bool Go(global::System.ReadOnlySpan<char> inputSpan)");
writer.WriteLine($" {{");
writer.Indent += 4;
requiredHelpers |= EmitGo(writer, rm, id);
Expand Down Expand Up @@ -299,6 +307,43 @@ static void AppendHashtableContents(IndentedTextWriter writer, Hashtable ht)
}
}

private static void EmitScan(IndentedTextWriter writer, RegexMethod rm, string id)
{
using (EmitBlock(writer, "while (true)"))
{
using (EmitBlock(writer, "if (FindFirstChar(text))"))
{
if (rm.MatchTimeout != int.MaxValue)
joperezr marked this conversation as resolved.
Show resolved Hide resolved
{
writer.WriteLine("base.CheckTimeout();");
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
writer.WriteLine();
}

writer.WriteLine("// If we got a match, we're done.");
using (EmitBlock(writer, "if (Go(text))"))
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
{
writer.WriteLine("return;");
}
writer.WriteLine();

writer.WriteLine("// Reset state for another iteration.");
writer.WriteLine("base.runtrackpos = base.runtrack!.Length;");
writer.WriteLine("base.runstackpos = base.runstack!.Length;");
writer.WriteLine("base.runcrawlpos = base.runcrawl!.Length;");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are any of these actually necessary? runcrawlpos should be 0 because if there were captures we would have uncaptured all the way back to 0 on a failed match. We don't use runtrack at all. I'd have expected runtrackpos to also end up at runtrack.Length as well, as we use it as a backtracking stack which should get fully unwound.

}
writer.WriteLine();

writer.WriteLine("// We failed to find a match. If we're at the end of the input, then we are done.");
using (EmitBlock(writer, "if (base.runtextpos == text.Length)"))
{
writer.WriteLine("return;");
}
writer.WriteLine();

writer.WriteLine("base.runtextpos++;");
}
}

/// <summary>Emits the body of the FindFirstChar override.</summary>
private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, string id)
{
Expand Down Expand Up @@ -347,7 +392,6 @@ private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writ
{
case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive:
Debug.Assert(!string.IsNullOrEmpty(code.FindOptimizations.LeadingCaseSensitivePrefix));
additionalDeclarations.Add("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
EmitIndexOf(code.FindOptimizations.LeadingCaseSensitivePrefix);
break;

Expand All @@ -356,13 +400,11 @@ private static RequiredHelperFunctions EmitFindFirstChar(IndentedTextWriter writ
case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive:
case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive:
Debug.Assert(code.FindOptimizations.FixedDistanceSets is { Count: > 0 });
additionalDeclarations.Add("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
EmitFixedSet();
break;

case FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight_CaseSensitive:
Debug.Assert(code.FindOptimizations.LiteralAfterLoop is not null);
additionalDeclarations.Add("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
EmitLiteralAfterAtomicLoop();
break;

Expand Down Expand Up @@ -463,7 +505,6 @@ bool EmitAnchors()
// the other anchors, which all skip all subsequent processing if found, with BOL we just use it
// to boost our position to the next line, and then continue normally with any searches.
writer.WriteLine("// Beginning-of-line anchor");
additionalDeclarations.Add("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
additionalDeclarations.Add("int beginning = base.runtextbeg;");
using (EmitBlock(writer, "if (pos > beginning && inputSpan[pos - 1] != '\\n')"))
{
Expand Down Expand Up @@ -763,13 +804,15 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe
writer.WriteLine($"int end = start + {(node.Kind == RegexNodeKind.Multi ? node.Str!.Length : 1)};");
writer.WriteLine("base.Capture(0, start, end);");
writer.WriteLine("base.runtextpos = end;");
writer.WriteLine("return true;");
return requiredHelpers;

case RegexNodeKind.Empty:
// This case isn't common in production, but it's very common when first getting started with the
// source generator and seeing what happens as you add more to expressions. When approaching
// it from a learning perspective, this is very common, as it's the empty string you start with.
writer.WriteLine("base.Capture(0, base.runtextpos, base.runtextpos);");
writer.WriteLine("return true;");
return requiredHelpers;
}

Expand All @@ -781,7 +824,6 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe

// Declare some locals.
string sliceSpan = "slice";
writer.WriteLine("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
writer.WriteLine("int pos = base.runtextpos, end = base.runtextend;");
writer.WriteLine($"int original_pos = pos;");
bool hasTimeout = EmitLoopTimeoutCounterIfNeeded(writer, rm);
Expand Down Expand Up @@ -826,7 +868,7 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe
}
writer.WriteLine("base.runtextpos = pos;");
writer.WriteLine("base.Capture(0, original_pos, pos);");
writer.WriteLine("return;");
writer.WriteLine("return true;");
writer.WriteLine();

// We only get here in the code if the whole expression fails to match and jumps to
Expand All @@ -837,6 +879,7 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe
{
EmitUncaptureUntil("0");
}
writer.WriteLine("return false;");

// We're done with the match.

Expand All @@ -846,8 +889,6 @@ private static RequiredHelperFunctions EmitGo(IndentedTextWriter writer, RegexMe
// And emit any required helpers.
if (additionalLocalFunctions.Count != 0)
{
writer.WriteLine("return;"); // not strictly necessary, just for readability

foreach (KeyValuePair<string, string[]> localFunctions in additionalLocalFunctions.OrderBy(k => k.Key))
{
writer.WriteLine();
Expand Down Expand Up @@ -2148,7 +2189,7 @@ void EmitBoundary(RegexNode node)
_ => "base.IsECMABoundary",
};

using (EmitBlock(writer, $"if ({call}(pos{(sliceStaticPos > 0 ? $" + {sliceStaticPos}" : "")}, base.runtextbeg, end))"))
using (EmitBlock(writer, $"if ({call}(inputSpan, pos{(sliceStaticPos > 0 ? $" + {sliceStaticPos}" : "")}, base.runtextbeg, end))"))
{
writer.WriteLine($"goto {doneLabel};");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,10 @@ public static void CompileToAssembly(System.Text.RegularExpressions.RegexCompila
public string GroupNameFromNumber(int i) { throw null; }
public int GroupNumberFromName(string name) { throw null; }
protected void InitializeReferences() { }
public bool IsMatch(System.ReadOnlySpan<char> input) { throw null; }
public static bool IsMatch(System.ReadOnlySpan<char> input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex)] string pattern) { throw null; }
public static bool IsMatch(System.ReadOnlySpan<char> input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex, "options")] string pattern, System.Text.RegularExpressions.RegexOptions options) { throw null; }
public static bool IsMatch(System.ReadOnlySpan<char> input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex, "options")] string pattern, System.Text.RegularExpressions.RegexOptions options, System.TimeSpan matchTimeout) { throw null; }
public bool IsMatch(string input) { throw null; }
public bool IsMatch(string input, int startat) { throw null; }
public static bool IsMatch(string input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex)] string pattern) { throw null; }
Expand Down Expand Up @@ -330,17 +334,20 @@ protected void DoubleCrawl() { }
protected void DoubleStack() { }
protected void DoubleTrack() { }
protected void EnsureStorage() { }
protected abstract bool FindFirstChar();
protected abstract void Go();
protected abstract void InitTrackCount();
protected virtual bool FindFirstChar() { throw null; }
protected virtual void Go() { throw null; }
protected virtual void InitTrackCount() { throw null; }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we delete our generated override of InitTrackCount in the source generator? What breaks if runtrackcount remains 0? I think that'll just mean that we end up with the default minimums enforced by InitializeForGo (or whatever it's now called). We should probably delete the override and then confirm the defaults make sense, and/or determine whether there's a different number we could be computing that would help presize this better. If memory serves, right now it's yielding the number of opcodes produced by RegexWriter that are backtracking... but that's no longer particularly relevant to how we use runtrack, and we don't use runstack at all... so there's a fair bit of waste here to be cleaned up.

protected bool IsBoundary(int index, int startpos, int endpos) { throw null; }
protected bool IsBoundary(System.ReadOnlySpan<char> inputSpan, int index, int startpos, int endpos) { throw null; } // -> This is just temporary on the prototype. Method will be emitted by the generator engines
protected bool IsECMABoundary(int index, int startpos, int endpos) { throw null; }
protected bool IsECMABoundary(System.ReadOnlySpan<char> inputSpan, int index, int startpos, int endpos) { throw null; } // -> This is just temporary on the prototype. Method will be emitted by the generator engines
protected bool IsMatched(int cap) { throw null; }
protected int MatchIndex(int cap) { throw null; }
protected int MatchLength(int cap) { throw null; }
protected int Popcrawl() { throw null; }
protected internal System.Text.RegularExpressions.Match? Scan(System.Text.RegularExpressions.Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick) { throw null; }
protected internal System.Text.RegularExpressions.Match? Scan(System.Text.RegularExpressions.Regex regex, string text, int textbeg, int textend, int textstart, int prevlen, bool quick, System.TimeSpan timeout) { throw null; }
protected internal virtual void Scan(System.ReadOnlySpan<char> text) { throw null; }
protected void TransferCapture(int capnum, int uncapnum, int start, int end) { }
protected void Uncapture() { }
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ namespace System.Text.RegularExpressions
/// </summary>
public class Capture
{
internal Capture(string text, int index, int length)
internal Capture(string? text, int index, int length)
{
Text = text;
Index = index;
Expand All @@ -19,27 +19,38 @@ internal Capture(string text, int index, int length)
/// <summary>Returns the position in the original string where the first character of captured substring was found.</summary>
public int Index { get; private protected set; }

/// <summary>
/// This method should only be called when the text for matching was sliced with a different beginning, so the resulting index of
/// the match is not from the start of the text, but instead the start of the slice. This method will add back that extra indices
/// to account for the original text beginning.
/// </summary>
/// <param name="beginning">The original text's beginning offset.</param>
internal void AddBeginningToIndex(int beginning)
{
Index += beginning;
}

/// <summary>Returns the length of the captured substring.</summary>
public int Length { get; private protected set; }

/// <summary>The original string</summary>
internal string Text { get; set; }
internal string? Text { get; set; }

/// <summary>Gets the captured substring from the input string.</summary>
/// <value>The substring that is captured by the match.</value>
public string Value => Text.Substring(Index, Length);
public string Value => Text is string text ? text.Substring(Index, Length) : string.Empty;

/// <summary>Gets the captured span from the input string.</summary>
/// <value>The span that is captured by the match.</value>
public ReadOnlySpan<char> ValueSpan => Text.AsSpan(Index, Length);
public ReadOnlySpan<char> ValueSpan => Text is string text ? text.AsSpan(Index, Length) : ReadOnlySpan<char>.Empty;

/// <summary>Returns the substring that was matched.</summary>
public override string ToString() => Value;

/// <summary>The substring to the left of the capture</summary>
internal ReadOnlyMemory<char> GetLeftSubstring() => Text.AsMemory(0, Index);
internal ReadOnlyMemory<char> GetLeftSubstring() => Text is string text ? text.AsMemory(0, Index) : ReadOnlyMemory<char>.Empty;

/// <summary>The substring to the right of the capture</summary>
internal ReadOnlyMemory<char> GetRightSubstring() => Text.AsMemory(Index + Length, Text.Length - Index - Length);
internal ReadOnlyMemory<char> GetRightSubstring() => Text is string text ? text.AsMemory(Index + Length, Text.Length - Index - Length) : ReadOnlyMemory<char>.Empty;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,18 @@ namespace System.Text.RegularExpressions
{
internal sealed class CompiledRegexRunner : RegexRunner
{
private readonly Action<RegexRunner> _goMethod;
private readonly Func<RegexRunner, bool> _findFirstCharMethod;
private readonly ScanDelegate _scanMethod;

public CompiledRegexRunner(Action<RegexRunner> go, Func<RegexRunner, bool> findFirstChar, int trackCount)
internal delegate void ScanDelegate(RegexRunner runner, ReadOnlySpan<char> text);

public CompiledRegexRunner(ScanDelegate scan, int trackCount)
{
_goMethod = go;
_findFirstCharMethod = findFirstChar;
_scanMethod = scan;
runtrackcount = trackCount;
}

protected override void Go() => _goMethod(this);

protected override bool FindFirstChar() => _findFirstCharMethod(this);
protected internal override void Scan(ReadOnlySpan<char> text)
=> _scanMethod(this, text);

protected override void InitTrackCount() { }
joperezr marked this conversation as resolved.
Show resolved Hide resolved
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,25 +7,21 @@ namespace System.Text.RegularExpressions
{
internal sealed class CompiledRegexRunnerFactory : RegexRunnerFactory
{
private readonly DynamicMethod _goMethod;
private readonly DynamicMethod _findFirstCharMethod;
private readonly DynamicMethod _scanMethod;
private readonly int _trackcount;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As with the source generator, it'd be nice to see whether we can just delete _trackcount, the setting of it, etc. That'll contribute to #62450.


// Delegates are lazily created to avoid forcing JIT'ing until the regex is actually executed.
private Action<RegexRunner>? _go;
private Func<RegexRunner, bool>? _findFirstChar;
private CompiledRegexRunner.ScanDelegate? _scan;

public CompiledRegexRunnerFactory(DynamicMethod goMethod, DynamicMethod findFirstCharMethod, int trackcount)
public CompiledRegexRunnerFactory(DynamicMethod scanMethod, int trackcount)
{
_goMethod = goMethod;
_findFirstCharMethod = findFirstCharMethod;
_scanMethod = scanMethod;
_trackcount = trackcount;
}

protected internal override RegexRunner CreateInstance() =>
new CompiledRegexRunner(
_go ??= _goMethod.CreateDelegate<Action<RegexRunner>>(),
_findFirstChar ??= _findFirstCharMethod.CreateDelegate<Func<RegexRunner, bool>>(),
_scan ??= _scanMethod.CreateDelegate<CompiledRegexRunner.ScanDelegate>(),
_trackcount);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ public class Group : Capture
internal int _capcount;
internal CaptureCollection? _capcoll;

internal Group(string text, int[] caps, int capcount, string name)
internal Group(string? text, int[] caps, int capcount, string name)
: base(text, capcount == 0 ? 0 : caps[(capcount - 1) * 2], capcount == 0 ? 0 : caps[(capcount * 2) - 1])
{
_caps = caps;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ public class Match : Group
internal bool _balancing; // whether we've done any balancing with this match. If we
// have done balancing, we'll need to do extra work in Tidy().

internal Match(Regex? regex, int capcount, string text, int begpos, int len, int startpos) :
internal Match(Regex? regex, int capcount, string? text, int begpos, int len, int startpos) :
base(text, new int[2], 0, "0")
{
_regex = regex;
Expand All @@ -66,7 +66,7 @@ internal Match(Regex? regex, int capcount, string text, int begpos, int len, int
/// <summary>Returns an empty Match object.</summary>
public static Match Empty { get; } = new Match(null, 1, string.Empty, 0, 0, 0);

internal void Reset(Regex regex, string text, int textbeg, int textend, int textstart)
internal void Reset(Regex regex, string? text, int textbeg, int textend, int textstart)
{
_regex = regex;
Text = text;
Expand Down Expand Up @@ -94,6 +94,7 @@ internal void Reset(Regex regex, string text, int textbeg, int textend, int text
public Match NextMatch()
{
Regex? r = _regex;
Debug.Assert(Text != null);
return r != null ?
r.Run(false, Length, Text, _textbeg, _textend - _textbeg, _textpos)! :
this;
Expand Down Expand Up @@ -338,7 +339,7 @@ internal sealed class MatchSparse : Match
{
private new readonly Hashtable _caps;

internal MatchSparse(Regex regex, Hashtable caps, int capcount, string text, int begpos, int len, int startpos) :
internal MatchSparse(Regex regex, Hashtable caps, int capcount, string? text, int begpos, int len, int startpos) :
base(regex, capcount, text, begpos, len, startpos)
{
_caps = caps;
Expand Down
Loading