From 42dbdcfb03ada0511c4b63bcdb2bdbd9a7c2a9b3 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 3 Mar 2023 12:29:44 -0500 Subject: [PATCH] syntax: allow Unicode in capture names This changes the rules for capture names to be much less restrictive. Namely, the requirements are now: 1. Must begin with an `_` or any alphabetic codepoint. 2. After the first codepoint, the name may contain any sequence of alpha-numeric codepoints along with `_`, `.`, `[` and `]`. Closes #595 --- regex-syntax/src/ast/parse.rs | 113 ++++++++++++++++++++++++++++++++-- src/expand.rs | 10 ++- src/lib.rs | 10 ++- 3 files changed, 125 insertions(+), 8 deletions(-) diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 901250f61..533766a86 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -109,11 +109,11 @@ fn is_hex(c: char) -> bool { /// If `first` is true, then `c` is treated as the first character in the /// group name (which must be alphabetic or underscore). fn is_capture_char(c: char, first: bool) -> bool { - c == '_' - || (!first - && (('0' <= c && c <= '9') || c == '.' || c == '[' || c == ']')) - || ('A' <= c && c <= 'Z') - || ('a' <= c && c <= 'z') + if first { + c == '_' || c.is_alphabetic() + } else { + c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric() + } } /// A builder for a regular expression parser. 
@@ -3910,6 +3910,55 @@ bar })) ); + assert_eq!( + parser("(?P<a¾>)").parse(), + Ok(Ast::Group(ast::Group { + span: Span::new( + Position::new(0, 1, 1), + Position::new(9, 1, 9), + ), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: Span::new( + Position::new(4, 1, 5), + Position::new(7, 1, 7), + ), + name: s("a¾"), + index: 1, + } + }, + ast: Box::new(Ast::Empty(Span::new( + Position::new(8, 1, 8), + Position::new(8, 1, 8), + ))), + })) + ); + assert_eq!( + parser("(?P<名字>)").parse(), + Ok(Ast::Group(ast::Group { + span: Span::new( + Position::new(0, 1, 1), + Position::new(12, 1, 9), + ), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: Span::new( + Position::new(4, 1, 5), + Position::new(10, 1, 7), + ), + name: s("名字"), + index: 1, + } + }, + ast: Box::new(Ast::Empty(Span::new( + Position::new(11, 1, 8), + Position::new(11, 1, 8), + ))), + })) + ); + assert_eq!( parser("(?P<").parse().unwrap_err(), TestError { @@ -3968,6 +4017,60 @@ bar }, } ); + assert_eq!( + parser("(?P<5>)").parse().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + parser("(?P<5a>)").parse().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + parser("(?P<¾>)").parse().unwrap_err(), + TestError { + span: Span::new( + Position::new(4, 1, 5), + Position::new(6, 1, 6), + ), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + parser("(?P<¾a>)").parse().unwrap_err(), + TestError { + span: Span::new( + Position::new(4, 1, 5), + Position::new(6, 1, 6), + ), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + parser("(?P<☃>)").parse().unwrap_err(), + TestError { + span: Span::new( + Position::new(4, 1, 5), + Position::new(7, 1, 6), + ), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + parser("(?P<a☃>)").parse().unwrap_err(), + TestError { + span: 
Span::new( + Position::new(5, 1, 6), + Position::new(8, 1, 7), + ), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); } #[test] diff --git a/src/expand.rs b/src/expand.rs index 67b514926..98fafc949 100644 --- a/src/expand.rs +++ b/src/expand.rs @@ -182,7 +182,8 @@ fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> { }) } -/// Returns true if and only if the given byte is allowed in a capture name. +/// Returns true if and only if the given byte is allowed in a capture name +/// written in non-brace form. fn is_valid_cap_letter(b: u8) -> bool { match b { b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true, @@ -236,4 +237,11 @@ mod tests { find!(find_cap_ref17, "$x_$y", c!("x_", 3)); find!(find_cap_ref18, "${#}", c!("#", 4)); find!(find_cap_ref19, "${Z[}", c!("Z[", 5)); + find!(find_cap_ref20, "${¾}", c!("¾", 5)); + find!(find_cap_ref21, "${¾a}", c!("¾a", 6)); + find!(find_cap_ref22, "${a¾}", c!("a¾", 6)); + find!(find_cap_ref23, "${☃}", c!("☃", 6)); + find!(find_cap_ref24, "${a☃}", c!("a☃", 7)); + find!(find_cap_ref25, "${☃a}", c!("☃a", 7)); + find!(find_cap_ref26, "${名字}", c!("名字", 9)); } diff --git a/src/lib.rs b/src/lib.rs index 1de347861..042d243f8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -360,13 +360,19 @@ regex matches `abc` at positions `0`, `1`, `2` and `3`.
 (exp)          numbered capture group (indexed by opening parenthesis)
-(?P<name>exp)  named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
-(?<name>exp)   named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
+(?P<name>exp)  named (also numbered) capture group (names must be alpha-numeric)
+(?<name>exp)   named (also numbered) capture group (names must be alpha-numeric)
 (?:exp)        non-capturing group
 (?flags)       set flags within current group
 (?flags:exp)   set flags for exp (non-capturing)
 
+Capture group names may contain any sequence of alpha-numeric Unicode +codepoints along with `.`, `_`, `[` and `]`. Names must start with either an +`_` or an alphabetic codepoint. Alphabetic codepoints correspond to the +`Alphabetic` Unicode property, while numeric codepoints correspond to the union +of the `Decimal_Number`, `Letter_Number` and `Other_Number` general categories. + Flags are each a single character. For example, `(?x)` sets the flag `x` and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets