From ce6bd7be161636ada8d384de4f12b94c86e2b0c2 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 29 Aug 2022 13:04:08 -0400 Subject: [PATCH] syntax: fix HIR printer This fixes some corner cases in the HIR printer where it would print the concrete syntax of a regex that does not match the natural interpretation of the HIR. One such example of this is: concat(a, alt(b, c)) This would get printed as ab|c But clearly, it should be printed as: a(?:b|c) The issue here is that the printer only considers the current HirKind when determining how to print it. Sometimes a group is needed to print an alt (and even a concat, in the case of 'rep(+, concat(a, b))'), but sometimes it isn't. We could address this in a few different ways: 1) Always print concats and alts inside a non-capturing group. 2) Make the printer detect precisely the cases where a non-capturing group is needed. 3) Make the HIR smart constructors insert non-capturing groups when needed. 4) Do some other thing to change the HIR to prevent these sorts of things by construction. This patch goes with (1). The reason in favor of it is that HIR printer was always about printing an equivalent regex and never about trying to print a "nice" regex. Indeed, the HIR printer can't print a nice regex, because the HIR represents a rigorously simplifed view of a regex to make analysis easier. (The most obvious such example are Unicode character classes. For example, the HIR printer never prints '\w'.) So inserting some extra groups (which it already does) even when they aren't strictly needed is perfectly okay. But still, it's useful to say why we didn't do the other choices: 2) Modifying the printer to only print groups when they're actually needed is pretty difficult. I tried this briefly, and handling this case requires some notion of what the parent expression is. This winds up being a possible but hairy change. 3) Making the HIR more complicated to make the printer correct seems like it's optimizing for the wrong thing. Inserting extra groups in places just obfuscates HIR values that already have clear semantics. That is, use concat(a, alt(b, c)) over concat(a, group(alt(b, c))). 4) It's not clear how we would change the HIR to guarantee this sort of thing wouldn't happen. At the very least, it seems likely it would require a more complex data type. At first, I had thought (1) seemed inelegant. But the more I thought about it, the more it seemed quite consistent with how the HIR printer already worked. So that's the path I took here. Closes #516, Closes #731 --- regex-syntax/src/hir/print.rs | 49 +++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index 9d7b2f70e..f905f78fc 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -89,10 +89,9 @@ impl Visitor for Writer { fn visit_pre(&mut self, hir: &Hir) -> fmt::Result { match *hir.kind() { - HirKind::Empty - | HirKind::Repetition(_) - | HirKind::Concat(_) - | HirKind::Alternation(_) => {} + // Empty is represented by nothing in the concrete syntax, and + // repetition operators are strictly suffix oriented. + HirKind::Empty | HirKind::Repetition(_) => {} HirKind::Literal(hir::Literal::Unicode(c)) => { self.write_literal_char(c)?; } @@ -162,6 +161,22 @@ impl Visitor for Writer { self.wtr.write_str("(?:")?; } }, + // Why do this? Wrapping concats and alts in non-capturing groups + // is not *always* necessary, but is sometimes necessary. For + // example, 'concat(a, alt(b, c))' should be written as 'a(?:b|c)' + // and not 'ab|c'. The former is clearly the intended meaning, but + // the latter is actually 'alt(concat(a, b), c)'. + // + // It would be possible to only group these things in cases where + // it's strictly necessary, but it requires knowing the parent + // expression. And since this technique is simpler and always + // correct, we take this route. More to the point, it is a non-goal + // of an HIR printer to show a nice easy-to-read regex. Indeed, + // its construction forbids it from doing so. Therefore, inserting + // extra groups where they aren't necessary is perfectly okay. + HirKind::Concat(_) | HirKind::Alternation(_) => { + self.wtr.write_str(r"(?:")?; + } } Ok(()) } @@ -172,9 +187,7 @@ impl Visitor for Writer { HirKind::Empty | HirKind::Literal(_) | HirKind::Class(_) - | HirKind::Look(_) - | HirKind::Concat(_) - | HirKind::Alternation(_) => {} + | HirKind::Look(_) => {} HirKind::Repetition(ref x) => { match (x.min, x.max) { (0, Some(1)) => { @@ -206,8 +219,10 @@ impl Visitor for Writer { self.wtr.write_str("?")?; } } - HirKind::Group(_) => { - self.wtr.write_str(")")?; + HirKind::Group(_) + | HirKind::Concat(_) + | HirKind::Alternation(_) => { + self.wtr.write_str(r")")?; } } Ok(()) @@ -374,12 +389,12 @@ mod tests { #[test] fn print_alternation() { - roundtrip("|", "|"); - roundtrip("||", "||"); + roundtrip("|", "(?:|)"); + roundtrip("||", "(?:||)"); - roundtrip("a|b", "a|b"); - roundtrip("a|b|c", "a|b|c"); - roundtrip("foo|bar|quux", "foo|bar|quux"); + roundtrip("a|b", "(?:a|b)"); + roundtrip("a|b|c", "(?:a|b|c)"); + roundtrip("foo|bar|quux", "(?:(?:foo)|(?:bar)|(?:quux))"); } // This is a regression test that stresses a peculiarity of how the HIR @@ -415,7 +430,7 @@ mod tests { }), Hir::literal(hir::Literal::Unicode('y')), ]); - assert_eq!(r"x(?:ab)+y", expr.to_string()); + assert_eq!(r"(?:x(?:ab)+y)", expr.to_string()); } // Just like regression_repetition_concat, but with the repetition using @@ -437,7 +452,7 @@ mod tests { }), Hir::literal(hir::Literal::Unicode('y')), ]); - assert_eq!(r"x(?:a|b)+y", expr.to_string()); + assert_eq!(r"(?:x(?:a|b)+y)", expr.to_string()); } // This regression test is very similar in flavor to @@ -460,6 +475,6 @@ mod tests { Hir::literal(hir::Literal::Unicode('c')), ]), ]); - assert_eq!(r"a(?:b|c)", expr.to_string()); + assert_eq!(r"(?:a(?:b|c))", expr.to_string()); } }