Skip to content

Commit

Permalink
Merge pull request #270 from BurntSushi/fixes
Browse files Browse the repository at this point in the history
fix a few bugs
  • Loading branch information
BurntSushi committed Aug 5, 2016
2 parents d95d021 + 16931b0 commit 11447f0
Show file tree
Hide file tree
Showing 7 changed files with 96 additions and 54 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ rust:
sudo: false
script:
- cargo build --verbose
- cargo build --verbose --manifest-path=regex-debug/Cargo.toml
- if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then
cargo build --verbose --manifest-path=regex-debug/Cargo.toml;
RUSTFLAGS="-C target-feature=+ssse3" cargo test --verbose --features 'simd-accel pattern';
else
travis_wait cargo test --verbose;
Expand Down
30 changes: 30 additions & 0 deletions regex-syntax/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,21 @@ impl Expr {
}
}

/// Returns true if and only if the expression has at least one matchable
/// sub-expression that must match the beginning of text.
pub fn has_anchored_start(&self) -> bool {
match *self {
Repeat { ref e, r, .. } => {
!r.matches_empty() && e.has_anchored_start()
}
Group { ref e, .. } => e.has_anchored_start(),
Concat(ref es) => es[0].has_anchored_start(),
Alternate(ref es) => es.iter().any(|e| e.has_anchored_start()),
StartText => true,
_ => false,
}
}

/// Returns true if and only if the expression is required to match at the
/// end of the text.
pub fn is_anchored_end(&self) -> bool {
Expand All @@ -543,6 +558,21 @@ impl Expr {
}
}

/// Returns true if and only if the expression has at least one matchable
/// sub-expression that must match the beginning of text.
pub fn has_anchored_end(&self) -> bool {
match *self {
Repeat { ref e, r, .. } => {
!r.matches_empty() && e.has_anchored_end()
}
Group { ref e, .. } => e.has_anchored_end(),
Concat(ref es) => es[es.len() - 1].has_anchored_end(),
Alternate(ref es) => es.iter().any(|e| e.has_anchored_end()),
EndText => true,
_ => false,
}
}

/// Returns true if and only if the expression contains sub-expressions
/// that can match arbitrary bytes.
pub fn has_bytes(&self) -> bool {
Expand Down
6 changes: 6 additions & 0 deletions src/compile.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,9 @@ impl Compiler {
// matching engine itself.
let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
self.compiled.is_anchored_start = expr.is_anchored_start();
self.compiled.has_anchored_start = expr.has_anchored_start();
self.compiled.is_anchored_end = expr.is_anchored_end();
self.compiled.has_anchored_end = expr.has_anchored_end();
if self.compiled.needs_dotstar() {
dotstar_patch = try!(self.c_dotstar());
self.compiled.start = dotstar_patch.entry;
Expand Down Expand Up @@ -171,6 +173,10 @@ impl Compiler {
exprs.iter().all(|e| e.is_anchored_start());
self.compiled.is_anchored_end =
exprs.iter().all(|e| e.is_anchored_end());
self.compiled.has_anchored_start =
exprs.iter().any(|e| e.has_anchored_start());
self.compiled.has_anchored_end =
exprs.iter().any(|e| e.has_anchored_end());
let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
if self.compiled.needs_dotstar() {
dotstar_patch = try!(self.c_dotstar());
Expand Down
81 changes: 34 additions & 47 deletions src/dfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,9 @@ struct CacheInner {
/// The total number of times this cache has been flushed by the DFA
/// because of space constraints.
flush_count: u64,
/// The total heap size of the DFA's cache. We use this to determine when
/// we should flush the cache.
size: usize,
}

/// The transition table.
Expand Down Expand Up @@ -420,18 +423,32 @@ impl Cache {
pub fn new(prog: &Program) -> Self {
// We add 1 to account for the special EOF byte.
let num_byte_classes = (prog.byte_classes[255] as usize + 1) + 1;
Cache {
let starts = vec![STATE_UNKNOWN; 256];
let mut cache = Cache {
inner: CacheInner {
compiled: HashMap::new(),
trans: Transitions::new(num_byte_classes),
states: vec![],
start_states: vec![STATE_UNKNOWN; 256],
start_states: starts,
stack: vec![],
flush_count: 0,
size: 0,
},
qcur: SparseSet::new(prog.insts.len()),
qnext: SparseSet::new(prog.insts.len()),
}
};
cache.inner.reset_size();
cache
}
}

impl CacheInner {
/// Resets the cache size to account for fixed costs, such as the program
/// and stack sizes.
fn reset_size(&mut self) {
self.size =
(self.start_states.len() * mem::size_of::<StatePtr>())
+ (self.stack.len() * mem::size_of::<InstPtr>());
}
}

Expand Down Expand Up @@ -1151,7 +1168,9 @@ impl<'a> Fsm<'a> {
}
// If the cache has gotten too big, wipe it.
if self.approximate_size() > self.prog.dfa_size_limit {
println!("clearing cache (size: {:?})", self.approximate_size());
if !self.clear_cache_and_save(current_state) {
println!("giving up");
// Ooops. DFA is giving up.
return None;
}
Expand Down Expand Up @@ -1280,6 +1299,7 @@ impl<'a> Fsm<'a> {
} else {
None
};
self.cache.reset_size();
self.cache.trans.clear();
self.cache.states.clear();
self.cache.compiled.clear();
Expand Down Expand Up @@ -1454,6 +1474,11 @@ impl<'a> Fsm<'a> {
}
// Finally, put our actual state on to our heap of states and index it
// so we can find it later.
self.cache.size +=
self.cache.trans.state_heap_size()
+ (2 * state.data.len())
+ (2 * mem::size_of::<State>())
+ mem::size_of::<StatePtr>();
self.cache.states.push(state.clone());
self.cache.compiled.insert(state, si);
// Transition table and set of states and map should all be in sync.
Expand Down Expand Up @@ -1536,51 +1561,8 @@ impl<'a> Fsm<'a> {
/// be wiped. Namely, it is possible that for certain regexes on certain
/// inputs, a new state could be created for every byte of input. (This is
/// bad for memory use, so we bound it with a cache.)
///
/// The approximation is guaranteed to be done in constant time (and
/// indeed, this requirement is why it's approximate).
fn approximate_size(&self) -> usize {
use std::mem::size_of as size;
// Estimate that there are about 16 instructions per state consuming
// 20 = 4 + (15 * 1) bytes of space (1 byte because of delta encoding).
const STATE_HEAP: usize = 20 + 1; // one extra byte for flags
let compiled =
(self.cache.compiled.len() * (size::<State>() + STATE_HEAP))
+ (self.cache.compiled.len() * size::<StatePtr>());
let states =
self.cache.states.len()
* (size::<State>()
+ STATE_HEAP
+ (self.num_byte_classes() * size::<StatePtr>()));
let start_states = self.cache.start_states.len() * size::<StatePtr>();
self.prog.approximate_size() + compiled + states + start_states
}

/// Returns the actual heap space of the DFA. This visits every state in
/// the DFA.
#[allow(dead_code)] // useful for debugging
fn actual_size(&self) -> usize {
let mut compiled = 0;
for k in self.cache.compiled.keys() {
compiled += mem::size_of::<State>();
compiled += mem::size_of::<StatePtr>();
compiled += k.data.len() * mem::size_of::<u8>();
}
let mut states = 0;
for s in &self.cache.states {
states += mem::size_of::<State>();
states += s.data.len() * mem::size_of::<u8>();
}
compiled
+ states
+ (self.cache.trans.num_states() *
mem::size_of::<StatePtr>() *
self.num_byte_classes())
+ (self.cache.start_states.len() * mem::size_of::<StatePtr>())
+ (self.cache.stack.len() * mem::size_of::<InstPtr>())
+ mem::size_of::<Fsm>()
+ mem::size_of::<CacheInner>()
+ self.prog.approximate_size() // OK, not actual, but close enough
self.cache.size + self.prog.approximate_size()
}
}

Expand Down Expand Up @@ -1628,6 +1610,11 @@ impl Transitions {
self.table[si as usize + cls]
}

/// The heap size, in bytes, of a single state in the transition table.
fn state_heap_size(&self) -> usize {
self.num_byte_classes * mem::size_of::<StatePtr>()
}

/// Like `next`, but uses unchecked access and is therefore unsafe.
unsafe fn next_unchecked(&self, si: StatePtr, cls: usize) -> StatePtr {
debug_assert!((si as usize) < self.table.len());
Expand Down
16 changes: 10 additions & 6 deletions src/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -840,11 +840,7 @@ impl<'c> ExecNoSync<'c> {
) -> Option<(usize, usize)> {
// We can't use match_end directly, because we may need to examine
// one "character" after the end of a match for lookahead operators.
let e = if self.ro.nfa.uses_bytes() {
cmp::min(match_end + 1, text.len())
} else {
cmp::min(next_utf8(text, match_end), text.len())
};
let e = cmp::min(next_utf8(text, match_end), text.len());
self.captures_nfa(slots, &text[..e], match_start)
}

Expand Down Expand Up @@ -1113,7 +1109,15 @@ impl ExecReadOnly {
// If our set of prefixes is complete, then we can use it to find
// a match in lieu of a regex engine. This doesn't quite work well in
// the presence of multiple regexes, so only do it when there's one.
if self.res.len() == 1 {
//
// TODO(burntsushi): Also, don't try to match literals if the regex is
// partially anchored. We could technically do it, but we'd need to
// create two sets of literals: all of them and then the subset that
// aren't anchored. We would then only search for all of them when at
// the beginning of the input and use the subset in all other cases.
if self.res.len() == 1
&& !self.nfa.has_anchored_start
&& !self.nfa.has_anchored_end {
if self.nfa.prefixes.complete() {
return if self.nfa.is_anchored_start {
Literal(MatchLiteralType::AnchoredStart)
Expand Down
8 changes: 8 additions & 0 deletions src/prog.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,12 @@ pub struct Program {
pub is_anchored_start: bool,
/// Whether the regex must match at the end of the input.
pub is_anchored_end: bool,
/// Whether the regex has at least one matchable sub-expression that must
/// match from the start of the input.
pub has_anchored_start: bool,
/// Whether the regex has at least one matchable sub-expression that must
/// match at the end of the input.
pub has_anchored_end: bool,
/// Whether this program contains a Unicode word boundary instruction.
pub has_unicode_word_boundary: bool,
/// A possibly empty machine for very quickly matching prefix literals.
Expand Down Expand Up @@ -91,6 +97,8 @@ impl Program {
is_reverse: false,
is_anchored_start: false,
is_anchored_end: false,
has_anchored_start: false,
has_anchored_end: false,
has_unicode_word_boundary: false,
prefixes: LiteralSearcher::empty(),
dfa_size_limit: 2 * (1<<20),
Expand Down
7 changes: 7 additions & 0 deletions tests/regression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,10 @@ split!(split_on_word_boundary, r"\b", r"Should this (work?)",
t!(" ("), t!("work"), t!("?)")]);
matiter!(word_boundary_dfa, r"\b", "a b c",
(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));

// See: https://github.com/rust-lang-nursery/regex/issues/268
matiter!(partial_anchor, u!(r"^a|b"), "ba", (0, 1));

// See: https://github.com/rust-lang-nursery/regex/issues/264
mat!(ascii_boundary_no_capture, u!(r"(?-u)\B"), "\u{28f3e}", Some((0, 0)));
mat!(ascii_boundary_capture, u!(r"(?-u)(\B)"), "\u{28f3e}", Some((0, 0)));

0 comments on commit 11447f0

Please sign in to comment.