Skip to content

Commit

Permalink
regex: expose lower level search APIs
Browse files Browse the repository at this point in the history
This commit exposes two new areas of API surface:

  1. A new `captures_read` method which provides a way to access the
     offsets of submatches while amortizing the allocation of the
     space required to store those offsets. Callers should still of
     course prefer to use the higher level `captures` method, but if
     performance dictates, this lower level API may be useful.
  2. New "at" variants of
     shortest_match/is_match/find/captures/captures_read that permit
     controlling where the start of a search begins within a slice.
     This is typically useful for controlling the match semantics of
     look-around operators such as `^` and `$`, and is necessary for
     implementing non-overlapping iterators.

Fixes #219
  • Loading branch information
BurntSushi committed Jun 25, 2018
1 parent 77140e7 commit 4c59b70
Show file tree
Hide file tree
Showing 7 changed files with 243 additions and 49 deletions.
4 changes: 1 addition & 3 deletions ci/script.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,7 @@ cargo test --verbose --manifest-path regex-syntax/Cargo.toml
cargo doc --verbose --manifest-path regex-syntax/Cargo.toml

# Run tests on regex-capi crate.
cargo build --verbose --manifest-path regex-capi/Cargo.toml
(cd regex-capi/ctest && ./compile && LD_LIBRARY_PATH=../../target/debug ./test)
(cd regex-capi/examples && ./compile && LD_LIBRARY_PATH=../../target/debug ./iter)
ci/test-regex-capi

# Make sure benchmarks compile. Don't run them though because they take a
# very long time. Also, check that we can build the regex-debug tool.
Expand Down
7 changes: 7 additions & 0 deletions ci/test-regex-capi
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/sh

set -e

cargo build --verbose --manifest-path regex-capi/Cargo.toml
(cd regex-capi/ctest && ./compile && LD_LIBRARY_PATH=../../target/debug ./test)
(cd regex-capi/examples && ./compile && LD_LIBRARY_PATH=../../target/debug ./iter)
10 changes: 5 additions & 5 deletions src/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ use prog::Program;
use re_builder::RegexOptions;
use re_bytes;
use re_set;
use re_trait::{RegularExpression, Slot, Locations, as_slots};
use re_trait::{RegularExpression, Slot, Locations};
use re_unicode;
use utf8::next_utf8;

Expand Down Expand Up @@ -359,13 +359,13 @@ impl<'c> RegularExpression for ExecNoSyncStr<'c> {
}

#[inline(always)] // reduces constant overhead
fn read_captures_at(
fn captures_read_at(
&self,
locs: &mut Locations,
text: &str,
start: usize,
) -> Option<(usize, usize)> {
self.0.read_captures_at(locs, text.as_bytes(), start)
self.0.captures_read_at(locs, text.as_bytes(), start)
}
}

Expand Down Expand Up @@ -528,13 +528,13 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
///
/// Note that the first two slots always correspond to the start and end
/// locations of the overall match.
fn read_captures_at(
fn captures_read_at(
&self,
locs: &mut Locations,
text: &[u8],
start: usize,
) -> Option<(usize, usize)> {
let slots = as_slots(locs);
let slots = locs.as_slots();
for slot in slots.iter_mut() {
*slot = None;
}
Expand Down
3 changes: 1 addition & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -542,11 +542,11 @@ pub use re_builder::set_unicode::*;
#[cfg(feature = "use_std")]
pub use re_set::unicode::*;
#[cfg(feature = "use_std")]
pub use re_trait::Locations;
#[cfg(feature = "use_std")]
pub use re_unicode::{
Regex, Match, Captures,
CaptureNames, Matches, CaptureMatches, SubCaptureMatches,
CaptureLocations, Locations,
Replacer, ReplacerRef, NoExpand, Split, SplitN,
escape,
};
Expand Down Expand Up @@ -644,7 +644,6 @@ pub mod bytes {
pub use re_builder::set_bytes::*;
pub use re_bytes::*;
pub use re_set::bytes::*;
pub use re_trait::Locations;
}

mod backtrack;
Expand Down
128 changes: 112 additions & 16 deletions src/re_bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ use exec::{Exec, ExecNoSync};
use expand::expand_bytes;
use error::Error;
use re_builder::bytes::RegexBuilder;
use re_trait::{self, RegularExpression, Locations, SubCapturesPosIter};
use re_trait::{self, RegularExpression, SubCapturesPosIter};

/// Match represents a single match of a regex in a haystack.
///
Expand Down Expand Up @@ -252,10 +252,10 @@ impl Regex {
/// The `0`th capture group is always unnamed, so it must always be
/// accessed with `get(0)` or `[0]`.
pub fn captures<'t>(&self, text: &'t [u8]) -> Option<Captures<'t>> {
let mut locs = self.locations();
self.read_captures_at(&mut locs, text, 0).map(|_| Captures {
let mut locs = self.capture_locations();
self.captures_read_at(&mut locs, text, 0).map(move |_| Captures {
text: text,
locs: locs,
locs: locs.0,
named_groups: self.0.capture_name_idx().clone(),
})
}
Expand Down Expand Up @@ -568,7 +568,6 @@ impl Regex {
/// The significance of the starting point is that it takes the surrounding
/// context into consideration. For example, the `\A` anchor can only
/// match when `start == 0`.
#[doc(hidden)]
pub fn shortest_match_at(
&self,
text: &[u8],
Expand All @@ -583,7 +582,6 @@ impl Regex {
/// The significance of the starting point is that it takes the surrounding
/// context into consideration. For example, the `\A` anchor can only
/// match when `start == 0`.
#[doc(hidden)]
pub fn is_match_at(&self, text: &[u8], start: usize) -> bool {
self.shortest_match_at(text, start).is_some()
}
Expand All @@ -594,7 +592,6 @@ impl Regex {
/// The significance of the starting point is that it takes the surrounding
/// context into consideration. For example, the `\A` anchor can only
/// match when `start == 0`.
#[doc(hidden)]
pub fn find_at<'t>(
&self,
text: &'t [u8],
Expand All @@ -604,21 +601,55 @@ impl Regex {
.map(|(s, e)| Match::new(text, s, e))
}

/// Returns the same as captures, but starts the search at the given
/// This is like `captures`, but uses
/// [`CaptureLocations`](struct.CaptureLocations.html)
/// instead of
/// [`Captures`](struct.Captures.html) in order to amortize allocations.
///
/// To create a `CaptureLocations` value, use the
/// `Regex::capture_locations` method.
///
/// This returns the overall match if this was successful, which is always
/// equivalent to the `0`th capture group.
pub fn captures_read<'t>(
&self,
locs: &mut CaptureLocations,
text: &'t [u8],
) -> Option<Match<'t>> {
self.captures_read_at(locs, text, 0)
}

/// Returns the same as `captures_read`, but starts the search at the given
/// offset and populates the capture locations given.
///
/// The significance of the starting point is that it takes the surrounding
/// context into consideration. For example, the `\A` anchor can only
/// match when `start == 0`.
pub fn captures_read_at<'t>(
&self,
locs: &mut CaptureLocations,
text: &'t [u8],
start: usize,
) -> Option<Match<'t>> {
self.0
.searcher()
.captures_read_at(&mut locs.0, text, start)
.map(|(s, e)| Match::new(text, s, e))
}

/// An undocumented alias for `captures_read_at`.
///
/// The `regex-capi` crate previously used this routine, so to avoid
/// breaking that crate, we continue to provide the name as an undocumented
/// alias.
#[doc(hidden)]
pub fn read_captures_at<'t>(
&self,
locs: &mut Locations,
locs: &mut CaptureLocations,
text: &'t [u8],
start: usize,
) -> Option<Match<'t>> {
self.0.searcher().read_captures_at(locs, text, start)
.map(|(s, e)| Match::new(text, s, e))
self.captures_read_at(locs, text, start)
}
}

Expand All @@ -639,11 +670,19 @@ impl Regex {
self.0.capture_names().len()
}

/// Returns an empty set of locations that can be reused in multiple calls
/// to `read_captures`.
/// Returns an empty set of capture locations that can be reused in
/// multiple calls to `captures_read` or `captures_read_at`.
pub fn capture_locations(&self) -> CaptureLocations {
CaptureLocations(self.0.searcher().locations())
}

/// An alias for `capture_locations` to preserve backward compatibility.
///
/// The `regex-capi` crate uses this method, so to avoid breaking that
/// crate, we continue to export it as an undocumented API.
#[doc(hidden)]
pub fn locations(&self) -> Locations {
self.0.searcher().locations()
pub fn locations(&self) -> CaptureLocations {
CaptureLocations(self.0.searcher().locations())
}
}

Expand Down Expand Up @@ -769,6 +808,63 @@ impl<'r> Iterator for CaptureNames<'r> {
}
}

/// CaptureLocations is a low level representation of the raw offsets of each
/// submatch.
///
/// You can think of this as a lower level
/// [`Captures`](struct.Captures.html), where this type does not support
/// named capturing groups directly and it does not borrow the text that these
/// offsets were matched on.
///
/// Primarily, this type is useful when using the lower level `Regex` APIs
/// such as `captures_read`, which permits amortizing the allocation in which
/// capture match locations are stored.
///
/// In order to build a value of this type, you'll need to call the
/// `capture_locations` method on the `Regex` being used to execute the search.
/// The value returned can then be reused in subsequent searches.
#[derive(Clone, Debug)]
pub struct CaptureLocations(re_trait::Locations);

/// A type alias for `CaptureLocations` for backwards compatibility.
///
/// Previously, we exported `CaptureLocations` as `Locations` in an
/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`),
/// we continue re-exporting the same undocumented API.
#[doc(hidden)]
pub type Locations = CaptureLocations;

impl CaptureLocations {
/// Returns the start and end positions of the Nth capture group. Returns
/// `None` if `i` is not a valid capture group or if the capture group did
/// not match anything. The positions returned are *always* byte indices
/// with respect to the original string matched.
#[inline]
pub fn get(&self, i: usize) -> Option<(usize, usize)> {
self.0.pos(i)
}

/// Returns the total number of capturing groups.
///
/// This is always at least `1` since every regex has at least `1`
/// capturing group that corresponds to the entire match.
#[inline]
pub fn len(&self) -> usize {
self.0.len()
}

/// An alias for the `get` method for backwards compatibility.
///
/// Previously, we exported `get` as `pos` in an undocumented API. To
/// prevent breaking that code (e.g., in `regex-capi`), we continue
/// re-exporting the same undocumented API.
#[doc(hidden)]
#[inline]
pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
self.get(i)
}
}

/// Captures represents a group of captured byte strings for a single match.
///
/// The 0th capture always corresponds to the entire match. Each subsequent
Expand All @@ -782,7 +878,7 @@ impl<'r> Iterator for CaptureNames<'r> {
/// `'t` is the lifetime of the matched text.
pub struct Captures<'t> {
text: &'t [u8],
locs: Locations,
locs: re_trait::Locations,
named_groups: Arc<HashMap<String, usize>>,
}

Expand Down
14 changes: 7 additions & 7 deletions src/re_trait.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ pub type Slot = Option<usize>;
///
/// Unlike `Captures`, a `Locations` value only stores offsets.
#[doc(hidden)]
#[derive(Clone, Debug)]
pub struct Locations(Vec<Slot>);

impl Locations {
Expand Down Expand Up @@ -47,12 +48,11 @@ impl Locations {
pub fn len(&self) -> usize {
self.0.len() / 2
}
}

/// This is a hack to make Locations -> &mut [Slot] be available internally
/// without exposing it in the public API.
pub fn as_slots(locs: &mut Locations) -> &mut [Slot] {
&mut locs.0
/// Return the individual slots as a slice.
pub(crate) fn as_slots(&mut self) -> &mut [Slot] {
&mut self.0
}
}

/// An iterator over capture group positions for a particular match of a
Expand Down Expand Up @@ -139,7 +139,7 @@ pub trait RegularExpression: Sized {

/// Returns the leftmost-first match location if one exists, and also
/// fills in any matching capture slot locations.
fn read_captures_at(
fn captures_read_at(
&self,
locs: &mut Locations,
text: &Self::Text,
Expand Down Expand Up @@ -246,7 +246,7 @@ impl<'t, R> Iterator for CaptureMatches<'t, R>
return None
}
let mut locs = self.0.re.locations();
let (s, e) = match self.0.re.read_captures_at(
let (s, e) = match self.0.re.captures_read_at(
&mut locs,
self.0.text,
self.0.last_end,
Expand Down
Loading

0 comments on commit 4c59b70

Please sign in to comment.