regex: expose lower level search APIs

This commit exposes two new areas of API surface: 1. A new `captures_read` method which provides a way to access the offsets of submatches while amortizing the allocation of the space required to store those offsets. Callers should still of course prefer to use the higher level `captures` method, but if performance dictates, this lower level API may be useful. 2. New "at" variants of shortest_match/is_match/find/captures/captures_read that permit controlling where the start of a search begins within a slice. This is typically useful for controlling the match semantics of look-around operators such as `^` and `$`, and is necessary for implementing non-overlapping iterators. Fixes #219
rust-lang · Jun 25, 2018 · 4c59b70 · 4c59b70
1 parent 77140e7
commit 4c59b70
Show file tree

Hide file tree

Showing 7 changed files with 243 additions and 49 deletions.
diff --git a/ci/script.sh b/ci/script.sh
@@ -30,9 +30,7 @@ cargo test --verbose --manifest-path regex-syntax/Cargo.toml
 cargo doc --verbose --manifest-path regex-syntax/Cargo.toml
 
 # Run tests on regex-capi crate.
-cargo build --verbose --manifest-path regex-capi/Cargo.toml
-(cd regex-capi/ctest && ./compile && LD_LIBRARY_PATH=../../target/debug ./test)
-(cd regex-capi/examples && ./compile && LD_LIBRARY_PATH=../../target/debug ./iter)
+ci/test-regex-capi
 
 # Make sure benchmarks compile. Don't run them though because they take a
 # very long time. Also, check that we can build the regex-debug tool.

diff --git a/ci/test-regex-capi b/ci/test-regex-capi
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+set -e
+
+cargo build --verbose --manifest-path regex-capi/Cargo.toml
+(cd regex-capi/ctest && ./compile && LD_LIBRARY_PATH=../../target/debug ./test)
+(cd regex-capi/examples && ./compile && LD_LIBRARY_PATH=../../target/debug ./iter)
diff --git a/src/exec.rs b/src/exec.rs
@@ -29,7 +29,7 @@ use prog::Program;
 use re_builder::RegexOptions;
 use re_bytes;
 use re_set;
-use re_trait::{RegularExpression, Slot, Locations, as_slots};
+use re_trait::{RegularExpression, Slot, Locations};
 use re_unicode;
 use utf8::next_utf8;
 
@@ -359,13 +359,13 @@ impl<'c> RegularExpression for ExecNoSyncStr<'c> {
     }
 
     #[inline(always)] // reduces constant overhead
-    fn read_captures_at(
+    fn captures_read_at(
         &self,
         locs: &mut Locations,
         text: &str,
         start: usize,
     ) -> Option<(usize, usize)> {
-        self.0.read_captures_at(locs, text.as_bytes(), start)
+        self.0.captures_read_at(locs, text.as_bytes(), start)
     }
 }
 
@@ -528,13 +528,13 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
     ///
     /// Note that the first two slots always correspond to the start and end
     /// locations of the overall match.
-    fn read_captures_at(
+    fn captures_read_at(
         &self,
         locs: &mut Locations,
         text: &[u8],
         start: usize,
     ) -> Option<(usize, usize)> {
-        let slots = as_slots(locs);
+        let slots = locs.as_slots();
         for slot in slots.iter_mut() {
             *slot = None;
         }

diff --git a/src/lib.rs b/src/lib.rs
@@ -542,11 +542,11 @@ pub use re_builder::set_unicode::*;
 #[cfg(feature = "use_std")]
 pub use re_set::unicode::*;
 #[cfg(feature = "use_std")]
-pub use re_trait::Locations;
 #[cfg(feature = "use_std")]
 pub use re_unicode::{
     Regex, Match, Captures,
     CaptureNames, Matches, CaptureMatches, SubCaptureMatches,
+    CaptureLocations, Locations,
     Replacer, ReplacerRef, NoExpand, Split, SplitN,
     escape,
 };
@@ -644,7 +644,6 @@ pub mod bytes {
     pub use re_builder::set_bytes::*;
     pub use re_bytes::*;
     pub use re_set::bytes::*;
-    pub use re_trait::Locations;
 }
 
 mod backtrack;

diff --git a/src/re_bytes.rs b/src/re_bytes.rs
@@ -21,7 +21,7 @@ use exec::{Exec, ExecNoSync};
 use expand::expand_bytes;
 use error::Error;
 use re_builder::bytes::RegexBuilder;
-use re_trait::{self, RegularExpression, Locations, SubCapturesPosIter};
+use re_trait::{self, RegularExpression, SubCapturesPosIter};
 
 /// Match represents a single match of a regex in a haystack.
 ///
@@ -252,10 +252,10 @@ impl Regex {
     /// The `0`th capture group is always unnamed, so it must always be
     /// accessed with `get(0)` or `[0]`.
     pub fn captures<'t>(&self, text: &'t [u8]) -> Option<Captures<'t>> {
-        let mut locs = self.locations();
-        self.read_captures_at(&mut locs, text, 0).map(|_| Captures {
+        let mut locs = self.capture_locations();
+        self.captures_read_at(&mut locs, text, 0).map(move |_| Captures {
             text: text,
-            locs: locs,
+            locs: locs.0,
             named_groups: self.0.capture_name_idx().clone(),
         })
     }
@@ -568,7 +568,6 @@ impl Regex {
     /// The significance of the starting point is that it takes the surrounding
     /// context into consideration. For example, the `\A` anchor can only
     /// match when `start == 0`.
-    #[doc(hidden)]
     pub fn shortest_match_at(
         &self,
         text: &[u8],
@@ -583,7 +582,6 @@ impl Regex {
     /// The significance of the starting point is that it takes the surrounding
     /// context into consideration. For example, the `\A` anchor can only
     /// match when `start == 0`.
-    #[doc(hidden)]
     pub fn is_match_at(&self, text: &[u8], start: usize) -> bool {
         self.shortest_match_at(text, start).is_some()
     }
@@ -594,7 +592,6 @@ impl Regex {
     /// The significance of the starting point is that it takes the surrounding
     /// context into consideration. For example, the `\A` anchor can only
     /// match when `start == 0`.
-    #[doc(hidden)]
     pub fn find_at<'t>(
         &self,
         text: &'t [u8],
@@ -604,21 +601,55 @@ impl Regex {
             .map(|(s, e)| Match::new(text, s, e))
     }
 
-    /// Returns the same as captures, but starts the search at the given
+    /// This is like `captures`, but uses
+    /// [`CaptureLocations`](struct.CaptureLocations.html)
+    /// instead of
+    /// [`Captures`](struct.Captures.html) in order to amortize allocations.
+    ///
+    /// To create a `CaptureLocations` value, use the
+    /// `Regex::capture_locations` method.
+    ///
+    /// This returns the overall match if this was successful, which is always
+    /// equivalent to the `0`th capture group.
+    pub fn captures_read<'t>(
+        &self,
+        locs: &mut CaptureLocations,
+        text: &'t [u8],
+    ) -> Option<Match<'t>> {
+        self.captures_read_at(locs, text, 0)
+    }
+
+    /// Returns the same as `captures_read`, but starts the search at the given
     /// offset and populates the capture locations given.
     ///
     /// The significance of the starting point is that it takes the surrounding
     /// context into consideration. For example, the `\A` anchor can only
     /// match when `start == 0`.
+    pub fn captures_read_at<'t>(
+        &self,
+        locs: &mut CaptureLocations,
+        text: &'t [u8],
+        start: usize,
+    ) -> Option<Match<'t>> {
+        self.0
+            .searcher()
+            .captures_read_at(&mut locs.0, text, start)
+            .map(|(s, e)| Match::new(text, s, e))
+    }
+
+    /// An undocumented alias for `captures_read_at`.
+    ///
+    /// The `regex-capi` crate previously used this routine, so to avoid
+    /// breaking that crate, we continue to provide the name as an undocumented
+    /// alias.
     #[doc(hidden)]
     pub fn read_captures_at<'t>(
         &self,
-        locs: &mut Locations,
+        locs: &mut CaptureLocations,
         text: &'t [u8],
         start: usize,
     ) -> Option<Match<'t>> {
-        self.0.searcher().read_captures_at(locs, text, start)
-            .map(|(s, e)| Match::new(text, s, e))
+        self.captures_read_at(locs, text, start)
     }
 }
 
@@ -639,11 +670,19 @@ impl Regex {
         self.0.capture_names().len()
     }
 
-    /// Returns an empty set of locations that can be reused in multiple calls
-    /// to `read_captures`.
+    /// Returns an empty set of capture locations that can be reused in
+    /// multiple calls to `captures_read` or `captures_read_at`.
+    pub fn capture_locations(&self) -> CaptureLocations {
+        CaptureLocations(self.0.searcher().locations())
+    }
+
+    /// An alias for `capture_locations` to preserve backward compatibility.
+    ///
+    /// The `regex-capi` crate uses this method, so to avoid breaking that
+    /// crate, we continue to export it as an undocumented API.
     #[doc(hidden)]
-    pub fn locations(&self) -> Locations {
-        self.0.searcher().locations()
+    pub fn locations(&self) -> CaptureLocations {
+        CaptureLocations(self.0.searcher().locations())
     }
 }
 
@@ -769,6 +808,63 @@ impl<'r> Iterator for CaptureNames<'r> {
     }
 }
 
+/// CaptureLocations is a low level representation of the raw offsets of each
+/// submatch.
+///
+/// You can think of this as a lower level
+/// [`Captures`](struct.Captures.html), where this type does not support
+/// named capturing groups directly and it does not borrow the text that these
+/// offsets were matched on.
+///
+/// Primarily, this type is useful when using the lower level `Regex` APIs
+/// such as `captures_read`, which permits amortizing the allocation in which
+/// capture match locations are stored.
+///
+/// In order to build a value of this type, you'll need to call the
+/// `capture_locations` method on the `Regex` being used to execute the search.
+/// The value returned can then be reused in subsequent searches.
+#[derive(Clone, Debug)]
+pub struct CaptureLocations(re_trait::Locations);
+
+/// A type alias for `CaptureLocations` for backwards compatibility.
+///
+/// Previously, we exported `CaptureLocations` as `Locations` in an
+/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`),
+/// we continue re-exporting the same undocumented API.
+#[doc(hidden)]
+pub type Locations = CaptureLocations;
+
+impl CaptureLocations {
+    /// Returns the start and end positions of the Nth capture group. Returns
+    /// `None` if `i` is not a valid capture group or if the capture group did
+    /// not match anything. The positions returned are *always* byte indices
+    /// with respect to the original string matched.
+    #[inline]
+    pub fn get(&self, i: usize) -> Option<(usize, usize)> {
+        self.0.pos(i)
+    }
+
+    /// Returns the total number of capturing groups.
+    ///
+    /// This is always at least `1` since every regex has at least `1`
+    /// capturing group that corresponds to the entire match.
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.0.len()
+    }
+
+    /// An alias for the `get` method for backwards compatibility.
+    ///
+    /// Previously, we exported `get` as `pos` in an undocumented API. To
+    /// prevent breaking that code (e.g., in `regex-capi`), we continue
+    /// re-exporting the same undocumented API.
+    #[doc(hidden)]
+    #[inline]
+    pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
+        self.get(i)
+    }
+}
+
 /// Captures represents a group of captured byte strings for a single match.
 ///
 /// The 0th capture always corresponds to the entire match. Each subsequent
@@ -782,7 +878,7 @@ impl<'r> Iterator for CaptureNames<'r> {
 /// `'t` is the lifetime of the matched text.
 pub struct Captures<'t> {
     text: &'t [u8],
-    locs: Locations,
+    locs: re_trait::Locations,
     named_groups: Arc<HashMap<String, usize>>,
 }
 

diff --git a/src/re_trait.rs b/src/re_trait.rs
@@ -18,6 +18,7 @@ pub type Slot = Option<usize>;
 ///
 /// Unlike `Captures`, a `Locations` value only stores offsets.
 #[doc(hidden)]
+#[derive(Clone, Debug)]
 pub struct Locations(Vec<Slot>);
 
 impl Locations {
@@ -47,12 +48,11 @@ impl Locations {
     pub fn len(&self) -> usize {
         self.0.len() / 2
     }
-}
 
-/// This is a hack to make Locations -> &mut [Slot] be available internally
-/// without exposing it in the public API.
-pub fn as_slots(locs: &mut Locations) -> &mut [Slot] {
-    &mut locs.0
+    /// Return the individual slots as a slice.
+    pub(crate) fn as_slots(&mut self) -> &mut [Slot] {
+        &mut self.0
+    }
 }
 
 /// An iterator over capture group positions for a particular match of a
@@ -139,7 +139,7 @@ pub trait RegularExpression: Sized {
 
     /// Returns the leftmost-first match location if one exists, and also
     /// fills in any matching capture slot locations.
-    fn read_captures_at(
+    fn captures_read_at(
         &self,
         locs: &mut Locations,
         text: &Self::Text,
@@ -246,7 +246,7 @@ impl<'t, R> Iterator for CaptureMatches<'t, R>
             return None
         }
         let mut locs = self.0.re.locations();
-        let (s, e) = match self.0.re.read_captures_at(
+        let (s, e) = match self.0.re.captures_read_at(
             &mut locs,
             self.0.text,
             self.0.last_end,