From 42dbdcfb03ada0511c4b63bcdb2bdbd9a7c2a9b3 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Fri, 3 Mar 2023 12:29:44 -0500
Subject: [PATCH] syntax: allow Unicode in capture names

This changes the rules for capture names to be much less restrictive.
Namely, the requirements are now:

1. Must begin with an `_` or any alphabetic codepoint.
2. After the first codepoint, the name may contain any sequence of
   alpha-numeric codepoints along with `_`, `.`, `[` and `]`.

Closes #595
---
 regex-syntax/src/ast/parse.rs | 113 ++++++++++++++++++++++++++++++++--
 src/expand.rs                 |  10 ++-
 src/lib.rs                    |  10 ++-
 3 files changed, 125 insertions(+), 8 deletions(-)

diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs
index 901250f61..533766a86 100644
--- a/regex-syntax/src/ast/parse.rs
+++ b/regex-syntax/src/ast/parse.rs
@@ -109,11 +109,11 @@ fn is_hex(c: char) -> bool {
 /// If `first` is true, then `c` is treated as the first character in the
 /// group name (which must be alphabetic or underscore).
 fn is_capture_char(c: char, first: bool) -> bool {
-    c == '_'
-        || (!first
-            && (('0' <= c && c <= '9') || c == '.' || c == '[' || c == ']'))
-        || ('A' <= c && c <= 'Z')
-        || ('a' <= c && c <= 'z')
+    if first {
+        c == '_' || c.is_alphabetic()
+    } else {
+        c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
+    }
 }
 
 /// A builder for a regular expression parser.
@@ -3910,6 +3910,55 @@ bar
             }))
         );
 
+        assert_eq!(
+            parser("(?P<a¾>)").parse(),
+            Ok(Ast::Group(ast::Group {
+                span: Span::new(
+                    Position::new(0, 1, 1),
+                    Position::new(9, 1, 9),
+                ),
+                kind: ast::GroupKind::CaptureName {
+                    starts_with_p: true,
+                    name: ast::CaptureName {
+                        span: Span::new(
+                            Position::new(4, 1, 5),
+                            Position::new(7, 1, 7),
+                        ),
+                        name: s("a¾"),
+                        index: 1,
+                    }
+                },
+                ast: Box::new(Ast::Empty(Span::new(
+                    Position::new(8, 1, 8),
+                    Position::new(8, 1, 8),
+                ))),
+            }))
+        );
+        assert_eq!(
+            parser("(?P<名字>)").parse(),
+            Ok(Ast::Group(ast::Group {
+                span: Span::new(
+                    Position::new(0, 1, 1),
+                    Position::new(12, 1, 9),
+                ),
+                kind: ast::GroupKind::CaptureName {
+                    starts_with_p: true,
+                    name: ast::CaptureName {
+                        span: Span::new(
+                            Position::new(4, 1, 5),
+                            Position::new(10, 1, 7),
+                        ),
+                        name: s("名字"),
+                        index: 1,
+                    }
+                },
+                ast: Box::new(Ast::Empty(Span::new(
+                    Position::new(11, 1, 8),
+                    Position::new(11, 1, 8),
+                ))),
+            }))
+        );
+
         assert_eq!(
             parser("(?P<").parse().unwrap_err(),
             TestError {
@@ -3968,6 +4017,60 @@ bar
                 },
             }
         );
+        assert_eq!(
+            parser("(?P<5>)").parse().unwrap_err(),
+            TestError {
+                span: span(4..5),
+                kind: ast::ErrorKind::GroupNameInvalid,
+            }
+        );
+        assert_eq!(
+            parser("(?P<5a>)").parse().unwrap_err(),
+            TestError {
+                span: span(4..5),
+                kind: ast::ErrorKind::GroupNameInvalid,
+            }
+        );
+        assert_eq!(
+            parser("(?P<¾>)").parse().unwrap_err(),
+            TestError {
+                span: Span::new(
+                    Position::new(4, 1, 5),
+                    Position::new(6, 1, 6),
+                ),
+                kind: ast::ErrorKind::GroupNameInvalid,
+            }
+        );
+        assert_eq!(
+            parser("(?P<¾a>)").parse().unwrap_err(),
+            TestError {
+                span: Span::new(
+                    Position::new(4, 1, 5),
+                    Position::new(6, 1, 6),
+                ),
+                kind: ast::ErrorKind::GroupNameInvalid,
+            }
+        );
+        assert_eq!(
+            parser("(?P<☃>)").parse().unwrap_err(),
+            TestError {
+                span: Span::new(
+                    Position::new(4, 1, 5),
+                    Position::new(7, 1, 6),
+                ),
+                kind: ast::ErrorKind::GroupNameInvalid,
+            }
+        );
+        assert_eq!(
+            parser("(?P<a☃>)").parse().unwrap_err(),
+            TestError {
+                span: Span::new(
+                    Position::new(5, 1, 6),
+                    Position::new(8, 1, 7),
+                ),
+                kind: ast::ErrorKind::GroupNameInvalid,
+            }
+        );
     }
 
     #[test]
diff --git a/src/expand.rs b/src/expand.rs
index 67b514926..98fafc949 100644
--- a/src/expand.rs
+++ b/src/expand.rs
@@ -182,7 +182,8 @@ fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
     })
 }
 
-/// Returns true if and only if the given byte is allowed in a capture name.
+/// Returns true if and only if the given byte is allowed in a capture name
+/// written in non-brace form.
 fn is_valid_cap_letter(b: u8) -> bool {
     match b {
         b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true,
@@ -236,4 +237,11 @@ mod tests {
     find!(find_cap_ref17, "$x_$y", c!("x_", 3));
     find!(find_cap_ref18, "${#}", c!("#", 4));
     find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
+    find!(find_cap_ref20, "${¾}", c!("¾", 5));
+    find!(find_cap_ref21, "${¾a}", c!("¾a", 6));
+    find!(find_cap_ref22, "${a¾}", c!("a¾", 6));
+    find!(find_cap_ref23, "${☃}", c!("☃", 6));
+    find!(find_cap_ref24, "${a☃}", c!("a☃", 7));
+    find!(find_cap_ref25, "${☃a}", c!("☃a", 7));
+    find!(find_cap_ref26, "${名字}", c!("名字", 9));
 }
diff --git a/src/lib.rs b/src/lib.rs
index 1de347861..042d243f8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -360,13 +360,19 @@ regex matches `abc` at positions `0`, `1`, `2` and `3`.
 
 <pre class="rust">
 (exp)          numbered capture group (indexed by opening parenthesis)
-(?P&lt;name&gt;exp)  named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
-(?&lt;name&gt;exp)   named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
+(?P&lt;name&gt;exp)  named (also numbered) capture group (names must be alpha-numeric)
+(?&lt;name&gt;exp)   named (also numbered) capture group (names must be alpha-numeric)
 (?:exp)        non-capturing group
 (?flags)       set flags within current group
 (?flags:exp)   set flags for exp (non-capturing)
 </pre>
 
+Capture group names must be any sequence of alpha-numeric Unicode codepoints,
+in addition to `.`, `_`, `[` and `]`. Names must start with either an `_` or
+an alphabetic codepoint. Alphabetic codepoints correspond to the `Alphabetic`
+Unicode property, while numeric codepoints correspond to the union of the
+`Decimal_Number`, `Letter_Number` and `Other_Number` general categories.
+
 Flags are each a single character. For example, `(?x)` sets the flag `x`
 and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at
 the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets