From b85f15ff729ce62133dd72a55ba3a8d71d6161ad Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 5 Mar 2023 22:59:52 -0500 Subject: [PATCH] syntax: refactor and optimize case folding This rewrites how Unicode simple case folding worked. Instead of just defining a single function and expecting callers to deal with the fallout, we now define a stateful type that "knows" about the structure of the case folding table. For example, it now knows enough to avoid binary search lookups in most cases. All we really have to do is require that callers look up codepoints in sequence, which is perfectly fine for our use case. Ref #893 --- regex-syntax/src/hir/interval.rs | 2 +- regex-syntax/src/hir/mod.rs | 16 +-- regex-syntax/src/hir/translate.rs | 5 +- regex-syntax/src/unicode.rs | 214 ++++++++++++++++-------------- 4 files changed, 125 insertions(+), 112 deletions(-) diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs index 4efcf1e4b..e063390a8 100644 --- a/regex-syntax/src/hir/interval.rs +++ b/regex-syntax/src/hir/interval.rs @@ -131,7 +131,7 @@ impl IntervalSet { /// Union this set with the given set, in place. pub fn union(&mut self, other: &IntervalSet) { - if other.ranges.is_empty() { + if other.ranges.is_empty() || self.ranges == other.ranges { return; } // This could almost certainly be done more efficiently. diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 09eafcb5f..481682e6e 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1232,23 +1232,13 @@ impl Interval for ClassUnicodeRange { &self, ranges: &mut Vec, ) -> Result<(), unicode::CaseFoldError> { - if !unicode::contains_simple_case_mapping(self.start, self.end)? 
{ + let mut folder = unicode::SimpleCaseFolder::new()?; + if !folder.overlaps(self.start, self.end) { return Ok(()); } let (start, end) = (u32::from(self.start), u32::from(self.end)); - let mut next_simple_cp = None; for cp in (start..=end).filter_map(char::from_u32) { - if next_simple_cp.map_or(false, |next| cp < next) { - continue; - } - let it = match unicode::simple_fold(cp)? { - Ok(it) => it, - Err(next) => { - next_simple_cp = next; - continue; - } - }; - for cp_folded in it { + for &cp_folded in folder.mapping(cp) { ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded)); } } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 3df9d1f8d..b22861fc7 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -824,8 +824,9 @@ impl<'t, 'p> TranslatorI<'t, 'p> { } if self.flags().unicode() { // If case folding won't do anything, then don't bother trying. - let map = - unicode::contains_simple_case_mapping(c, c).map_err(|_| { + let map = unicode::SimpleCaseFolder::new() + .map(|f| f.overlaps(c, c)) + .map_err(|_| { self.error(span, ErrorKind::UnicodeCaseUnavailable) })?; if !map { diff --git a/regex-syntax/src/unicode.rs b/regex-syntax/src/unicode.rs index ed6d0948e..8f4602d86 100644 --- a/regex-syntax/src/unicode.rs +++ b/regex-syntax/src/unicode.rs @@ -64,75 +64,122 @@ impl core::fmt::Display for UnicodeWordError { } } -/// Return an iterator over the equivalence class of simple case mappings -/// for the given codepoint. The equivalence class does not include the -/// given codepoint. -/// -/// If the equivalence class is empty, then this returns the next scalar -/// value that has a non-empty equivalence class, if it exists. If no such -/// scalar value exists, then `None` is returned. The point of this behavior -/// is to permit callers to avoid calling `simple_fold` more than they need -/// to, since there is some cost to fetching the equivalence class. 
-/// -/// This returns an error if the Unicode case folding tables are not available. -pub fn simple_fold( - c: char, -) -> Result, Option>, CaseFoldError> { - #[cfg(not(feature = "unicode-case"))] - fn imp( - _: char, - ) -> Result, Option>, CaseFoldError> - { - use core::option::IntoIter; - Err::, _>, _>(CaseFoldError(())) - } +/// A state oriented traverser of the simple case folding table. +/// +/// A case folder can be constructed via `SimpleCaseFolder::new()`, which will +/// return an error if the underlying case folding table is unavailable. +/// +/// After construction, it is expected that callers will use +/// `SimpleCaseFolder::mapping` by calling it with codepoints in strictly +/// increasing order. For example, calling it on `b` and then on `a` is illegal +/// and will result in a panic. +/// +/// The main idea of this type is that it tries hard to make mapping lookups +/// fast by exploiting the structure of the underlying table, and the ordering +/// assumption enables this. +#[derive(Debug)] +pub struct SimpleCaseFolder { + /// The simple case fold table. It's a sorted association list, where the + /// keys are Unicode scalar values and the values are the corresponding + /// equivalence class (not including the key) of the "simple" case folded + /// Unicode scalar values. + table: &'static [(char, &'static [char])], + /// The last codepoint that was used for a lookup. + last: Option, + /// The index to the entry in `table` corresponding to the smallest key `k` + /// such that `k > k0`, where `k0` is the most recent key lookup. Note that + /// in particular, `k0` may not be in the table! 
+ next: usize, +} - #[cfg(feature = "unicode-case")] - fn imp( - c: char, - ) -> Result, Option>, CaseFoldError> - { - use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; - - Ok(CASE_FOLDING_SIMPLE - .binary_search_by_key(&c, |&(c1, _)| c1) - .map(|i| CASE_FOLDING_SIMPLE[i].1.iter().copied()) - .map_err(|i| { - if i >= CASE_FOLDING_SIMPLE.len() { - None - } else { - Some(CASE_FOLDING_SIMPLE[i].0) - } - })) +impl SimpleCaseFolder { + /// Create a new simple case folder, returning an error if the underlying + /// case folding table is unavailable. + pub fn new() -> Result { + #[cfg(not(feature = "unicode-case"))] + { + Err(CaseFoldError(())) + } + #[cfg(feature = "unicode-case")] + { + Ok(SimpleCaseFolder { + table: crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE, + last: None, + next: 0, + }) + } } - imp(c) -} - -/// Returns true if and only if the given (inclusive) range contains at least -/// one Unicode scalar value that has a non-empty non-trivial simple case -/// mapping. -/// -/// This function panics if `end < start`. -/// -/// This returns an error if the Unicode case folding tables are not available. -pub fn contains_simple_case_mapping( - start: char, - end: char, -) -> Result { - #[cfg(not(feature = "unicode-case"))] - fn imp(_: char, _: char) -> Result { - Err(CaseFoldError(())) + /// Return the equivalence class of case folded codepoints for the given + /// codepoint. The equivalence class returned never includes the codepoint + /// given. If the given codepoint has no case folded codepoints (i.e., + /// no entry in the underlying case folding table), then this returns an + /// empty slice. + /// + /// # Panics + /// + /// This panics when called with a `c` that is less than or equal to the + /// previous call. In other words, callers need to use this method with + /// strictly increasing values of `c`. 
+ pub fn mapping(&mut self, c: char) -> &'static [char] { + if let Some(last) = self.last { + assert!( + last < c, + "got codepoint U+{:X} which occurs before \ + last codepoint U+{:X}", + u32::from(c), + u32::from(last), + ); + } + self.last = Some(c); + if self.next >= self.table.len() { + return &[]; + } + let (k, v) = self.table[self.next]; + if k == c { + self.next += 1; + return v; + } + match self.get(c) { + Err(i) => { + self.next = i; + &[] + } + Ok(i) => { + // Since we require lookups to proceed + // in order, anything we find should be + // after whatever we thought might be + // next. Otherwise, the caller is either + // going out of order or we would have + // found our next key at 'self.next'. + assert!(i > self.next); + self.next = i + 1; + self.table[i].1 + } + } } - #[cfg(feature = "unicode-case")] - fn imp(start: char, end: char) -> Result { + /// Returns true if and only if the given range overlaps with any region + /// of the underlying case folding table. That is, when true, there exists + /// at least one codepoint in the inclusive range `[start, end]` that has + /// a non-trivial equivalence class of case folded codepoints. Conversely, + /// when this returns false, all codepoints in the range `[start, end]` + /// correspond to the trivial equivalence class of case folded codepoints, + /// i.e., itself. + /// + /// This is useful to call before iterating over the codepoints in the + /// range and looking up the mapping for each. If you know none of the + /// mappings will return anything, then you might be able to skip doing it + /// altogether. + /// + /// # Panics + /// + /// This panics when `end < start`. 
+ pub fn overlaps(&self, start: char, end: char) -> bool { use core::cmp::Ordering; - use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; - assert!(start <= end); - Ok(CASE_FOLDING_SIMPLE + self.table .binary_search_by(|&(c, _)| { if start <= c && c <= end { Ordering::Equal @@ -142,10 +189,15 @@ pub fn contains_simple_case_mapping( Ordering::Less } }) - .is_ok()) + .is_ok() } - imp(start, end) + /// Returns the index at which `c` occurs in the simple case fold table. If + /// `c` does not occur, then this returns an `i` such that `table[i-1].0 < + /// c` and `table[i].0 > c`. + fn get(&self, c: char) -> Result { + self.table.binary_search_by_key(&c, |&(c1, _)| c1) + } } /// A query for finding a character class defined by Unicode. This supports @@ -892,20 +944,12 @@ mod tests { #[cfg(feature = "unicode-case")] fn simple_fold_ok(c: char) -> impl Iterator { - simple_fold(c).unwrap().unwrap() - } - - #[cfg(feature = "unicode-case")] - fn simple_fold_err(c: char) -> Option { - match simple_fold(c).unwrap() { - Ok(_) => unreachable!("simple_fold returned Ok iterator"), - Err(next) => next, - } + SimpleCaseFolder::new().unwrap().mapping(c).iter().copied() } #[cfg(feature = "unicode-case")] fn contains_case_map(start: char, end: char) -> bool { - contains_simple_case_mapping(start, end).unwrap() + SimpleCaseFolder::new().unwrap().overlaps(start, end) } #[test] @@ -931,26 +975,10 @@ mod tests { assert_eq!(xs, alloc::vec!['a']); } - #[test] - #[cfg(feature = "unicode-case")] - fn simple_fold_empty() { - assert_eq!(Some('A'), simple_fold_err('?')); - assert_eq!(Some('A'), simple_fold_err('@')); - assert_eq!(Some('a'), simple_fold_err('[')); - assert_eq!(Some('Ⰰ'), simple_fold_err('☃')); - } - - #[test] - #[cfg(feature = "unicode-case")] - fn simple_fold_max() { - assert_eq!(None, simple_fold_err('\u{10FFFE}')); - assert_eq!(None, simple_fold_err('\u{10FFFF}')); - } - #[test] #[cfg(not(feature = "unicode-case"))] fn simple_fold_disabled() { - 
assert!(simple_fold('a').is_err()); + assert!(SimpleCaseFolder::new().is_err()); } #[test] @@ -969,12 +997,6 @@ mod tests { assert!(!contains_case_map('☃', '☃')); } - #[test] - #[cfg(not(feature = "unicode-case"))] - fn range_contains_disabled() { - assert!(contains_simple_case_mapping('a', 'a').is_err()); - } - #[test] #[cfg(feature = "unicode-gencat")] fn regression_466() {