syntax: fix ascii class union bug

This fixes a bug in how ASCII class unioning was implemented. Namely, the previous implementation first unioned the two classes together and then erroneously applied negation/case-folding to the combined result based on the most recently added class, even if the previously added class wasn't negated. So for example, given the regex '[[:alnum:][:^ascii:]]', this would initialize the class with '[:alnum:]', then add all '[:^ascii:]' codepoints and then negate the entire thing because of the negation in '[:^ascii:]'. Negating the entire thing is clearly wrong and not the intended semantics. We fix this by applying negation/case-folding only to the class we're dealing with, and then we union it with whatever existing class we're building. Fixes #680
rust-lang · May 18, 2022 · 72f09f1 · 72f09f1
1 parent b537286
commit 72f09f1
Show file tree

Hide file tree

Showing 2 changed files with 57 additions and 11 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,10 @@
+TBD
+===
+
+* [BUG #680](https://github.com/rust-lang/regex/issues/680):
+  Fixes a bug where `[[:alnum:][:^ascii:]]` dropped `[:alnum:]` from the class.
+
+
 1.5.5 (2022-03-08)
 ==================
 This release fixes a security bug in the regex compiler. This bug permits a

diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
@@ -434,20 +434,14 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
             }
             ast::ClassSetItem::Ascii(ref x) => {
                 if self.flags().unicode() {
+                    let xcls = self.hir_ascii_unicode_class(x)?;
                     let mut cls = self.pop().unwrap().unwrap_class_unicode();
-                    for &(s, e) in ascii_class(&x.kind) {
-                        cls.push(hir::ClassUnicodeRange::new(s, e));
-                    }
-                    self.unicode_fold_and_negate(
-                        &x.span, x.negated, &mut cls,
-                    )?;
+                    cls.union(&xcls);
                     self.push(HirFrame::ClassUnicode(cls));
                 } else {
+                    let xcls = self.hir_ascii_byte_class(x)?;
                     let mut cls = self.pop().unwrap().unwrap_class_bytes();
-                    for &(s, e) in ascii_class(&x.kind) {
-                        cls.push(hir::ClassBytesRange::new(s as u8, e as u8));
-                    }
-                    self.bytes_fold_and_negate(&x.span, x.negated, &mut cls)?;
+                    cls.union(&xcls);
                     self.push(HirFrame::ClassBytes(cls));
                 }
             }
@@ -853,6 +847,32 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
         result
     }
 
+    fn hir_ascii_unicode_class(
+        &self,
+        ast: &ast::ClassAscii,
+    ) -> Result<hir::ClassUnicode> {
+        let mut cls = hir::ClassUnicode::new(
+            ascii_class(&ast.kind)
+                .iter()
+                .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)),
+        );
+        self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
+        Ok(cls)
+    }
+
+    fn hir_ascii_byte_class(
+        &self,
+        ast: &ast::ClassAscii,
+    ) -> Result<hir::ClassBytes> {
+        let mut cls = hir::ClassBytes::new(
+            ascii_class(&ast.kind)
+                .iter()
+                .map(|&(s, e)| hir::ClassBytesRange::new(s as u8, e as u8)),
+        );
+        self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
+        Ok(cls)
+    }
+
     fn hir_perl_unicode_class(
         &self,
         ast_class: &ast::ClassPerl,
@@ -948,7 +968,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
         class: &mut hir::ClassBytes,
     ) -> Result<()> {
         // Note that we must apply case folding before negation!
-        // Consider `(?i)[^x]`. If we applied negation field, then
+        // Consider `(?i)[^x]`. If we applied negation first, then
         // the result would be the character class that matched any
         // Unicode scalar value.
         if self.flags().case_insensitive() {
@@ -1943,6 +1963,25 @@ mod tests {
         );
     }
 
+    #[test]
+    fn class_ascii_multiple() {
+        // See: https://github.com/rust-lang/regex/issues/680
+        assert_eq!(
+            t("[[:alnum:][:^ascii:]]"),
+            hir_union(
+                hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)),
+                hir_uclass(&[('\u{80}', '\u{10FFFF}')]),
+            ),
+        );
+        assert_eq!(
+            t_bytes("(?-u)[[:alnum:][:^ascii:]]"),
+            hir_union(
+                hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Alnum)),
+                hir_bclass(&[(0x80, 0xFF)]),
+            ),
+        );
+    }
+
     #[test]
     #[cfg(feature = "unicode-perl")]
     fn class_perl() {