syntax: permit empty character classes

An empty character class is effectively a way to write something that can never match anything. The regex crate has pretty much always returned an error for such things because it was never taught how to handle "always fail" states. Partly because I just didn't think about it when initially writing the regex engines and partly because it isn't often useful. With that said, it should be supported for completeness and because there is no real reason to not support it. Moreover, it can be useful in certain contexts where regexes are generated and you want to insert an expression that can never match. It's somewhat contrived, but it happens when the interface is a regex pattern. Previously, the ban on empty character classes was implemented in the regex-syntax crate. But with the rewrite in #656 getting closer and closer to landing, it's now time to relax this restriction. However, we do keep the overall restriction in the 'regex' API by returning an error in the NFA compiler. Once #656 is done, the new regex engines will permit this case.
rust-lang · Apr 17, 2023 · ae5065b · ae5065b
1 parent 85d888f
commit ae5065b
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 84 deletions.
diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
@@ -78,21 +78,8 @@ pub enum ErrorKind {
     /// available, and the regular expression required Unicode aware case
     /// insensitivity.
     UnicodeCaseUnavailable,
-    /// This occurs when the translator attempts to construct a character class
-    /// that is empty.
-    ///
-    /// Note that this restriction in the translator may be removed in the
-    /// future.
-    EmptyClassNotAllowed,
 }
 
-// BREADCRUMBS:
-//
-// Remove EmptyClassNotAllowed
-// Make errors non_exhaustive
-// Simplify repetitions (get rid of ZeroOrOne, OneOrMore etc)
-// Get rid of deprecated things
-
 impl std::error::Error for Error {}
 
 impl fmt::Display for Error {
@@ -118,7 +105,6 @@ impl fmt::Display for ErrorKind {
                 "Unicode-aware case insensitivity matching is not available \
                  (make sure the unicode-case feature is enabled)"
             }
-            EmptyClassNotAllowed => "empty character classes are not allowed",
         };
         f.write_str(msg)
     }

diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
@@ -322,12 +322,6 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
                         ast.negated,
                         &mut cls,
                     )?;
-                    if cls.ranges().is_empty() {
-                        return Err(self.error(
-                            ast.span,
-                            ErrorKind::EmptyClassNotAllowed,
-                        ));
-                    }
                     let expr = Hir::class(hir::Class::Unicode(cls));
                     self.push(HirFrame::Expr(expr));
                 } else {
@@ -337,13 +331,6 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
                         ast.negated,
                         &mut cls,
                     )?;
-                    if cls.ranges().is_empty() {
-                        return Err(self.error(
-                            ast.span,
-                            ErrorKind::EmptyClassNotAllowed,
-                        ));
-                    }
-
                     let expr = Hir::class(hir::Class::Bytes(cls));
                     self.push(HirFrame::Expr(expr));
                 }
@@ -839,11 +826,6 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
                 ast_class.negated,
                 class,
             )?;
-            if class.ranges().is_empty() {
-                let err = self
-                    .error(ast_class.span, ErrorKind::EmptyClassNotAllowed);
-                return Err(err);
-            }
         }
         result
     }
@@ -2357,16 +2339,7 @@ mod tests {
     #[test]
     #[cfg(feature = "unicode-gencat")]
     fn class_unicode_any_empty() {
-        assert_eq!(
-            t_err(r"\P{any}"),
-            TestError {
-                kind: hir::ErrorKind::EmptyClassNotAllowed,
-                span: Span::new(
-                    Position::new(0, 1, 1),
-                    Position::new(7, 1, 8)
-                ),
-            }
-        );
+        assert_eq!(t(r"\P{any}"), hir_uclass(&[]),);
     }
 
     #[test]
@@ -2518,27 +2491,9 @@ mod tests {
             }
         );
         #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
-        assert_eq!(
-            t_err(r"[^\s\S]"),
-            TestError {
-                kind: hir::ErrorKind::EmptyClassNotAllowed,
-                span: Span::new(
-                    Position::new(0, 1, 1),
-                    Position::new(7, 1, 8)
-                ),
-            }
-        );
+        assert_eq!(t(r"[^\s\S]"), hir_uclass(&[]),);
         #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
-        assert_eq!(
-            t_err(r"(?-u)[^\s\S]"),
-            TestError {
-                kind: hir::ErrorKind::EmptyClassNotAllowed,
-                span: Span::new(
-                    Position::new(5, 1, 6),
-                    Position::new(12, 1, 13)
-                ),
-            }
-        );
+        assert_eq!(t_bytes(r"(?-u)[^\s\S]"), hir_bclass(&[]),);
     }
 
     #[test]
@@ -2686,27 +2641,9 @@ mod tests {
             hir_uclass(&[('C', 'C'), ('c', 'c')])
         );
 
-        assert_eq!(
-            t_err(r"[^a-c[^c]]"),
-            TestError {
-                kind: hir::ErrorKind::EmptyClassNotAllowed,
-                span: Span::new(
-                    Position::new(0, 1, 1),
-                    Position::new(10, 1, 11)
-                ),
-            }
-        );
+        assert_eq!(t(r"[^a-c[^c]]"), hir_uclass(&[]),);
         #[cfg(feature = "unicode-case")]
-        assert_eq!(
-            t_err(r"(?i)[^a-c[^c]]"),
-            TestError {
-                kind: hir::ErrorKind::EmptyClassNotAllowed,
-                span: Span::new(
-                    Position::new(4, 1, 5),
-                    Position::new(14, 1, 15)
-                ),
-            }
-        );
+        assert_eq!(t(r"(?i)[^a-c[^c]]"), hir_uclass(&[]),);
     }
 
     #[test]

diff --git a/src/compile.rs b/src/compile.rs
@@ -457,7 +457,11 @@ impl Compiler {
     fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
         use std::mem::size_of;
 
-        assert!(!ranges.is_empty());
+        if ranges.is_empty() {
+            return Err(Error::Syntax(
+                "empty character classes are not allowed".to_string(),
+            ));
+        }
         if self.compiled.uses_bytes() {
             Ok(Some(CompileClass { c: self, ranges }.compile()?))
         } else {
@@ -482,7 +486,11 @@ impl Compiler {
         &mut self,
         ranges: &[hir::ClassBytesRange],
     ) -> ResultOrEmpty {
-        debug_assert!(!ranges.is_empty());
+        if ranges.is_empty() {
+            return Err(Error::Syntax(
+                "empty character classes are not allowed".to_string(),
+            ));
+        }
 
         let first_split_entry = self.insts.len();
         let mut holes = vec![];