From 7084f14571b6d45240f6f6c80e6f2d40a3333dd3 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Wed, 27 Apr 2016 08:48:47 -0400
Subject: [PATCH] Add known upper limit to capture search.

The DFA will report the end location of a match, so we should pass that
along to capture detection. In theory, the DFA and the NFA report the
same match locations, so this upper bound shouldn't be necessary---the
NFA should quit once it finds the right match. It turns out, though,
that bounding the text has two important ramifications:

1. It will enable the backtracking engine to be used more often. In
particular, the backtracking engine can only be used on small inputs and
this change decreases the size of the input by only considering the
match.
2. The backtracking engine must start every search by zeroing memory
that is proportional to the size of the input. If the input is smaller,
then this runs more quickly.

We are also careful to bound the match to one additional "character"
past the end of the match, so that lookahead operators work correctly.
---
 src/exec.rs | 25 ++++++++++++-------------
 src/utf8.rs | 19 +++++++++++++++++++
 2 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/src/exec.rs b/src/exec.rs
index 46d885c22f..9d24423523 100644
--- a/src/exec.rs
+++ b/src/exec.rs
@@ -10,6 +10,7 @@
 
 use std::cell::RefCell;
 use std::collections::HashMap;
+use std::cmp;
 use std::sync::Arc;
 
 use thread_local::CachedThreadLocal;
@@ -27,6 +28,7 @@ use re_bytes;
 use re_trait::{RegularExpression, Slot};
 use re_unicode;
 use set;
+use utf8::next_utf8;
 
 /// Exec manages the execution of a regular expression.
 ///
@@ -253,17 +255,7 @@ impl<'c> RegularExpression for ExecNoSyncStr<'c> {
     fn slots_len(&self) -> usize { self.0.slots_len() }
 
     fn next_after_empty(&self, text: &str, i: usize) -> usize {
-        let b = text.as_bytes()[i];
-        let inc = if b <= 0x7F {
-            1
-        } else if b <= 0b110_11111 {
-            2
-        } else if b <= 0b1110_1111 {
-            3
-        } else {
-            4
-        };
-        i + inc
+        next_utf8(text.as_bytes(), i)
     }
 
     #[inline(always)] // reduces constant overhead
@@ -439,9 +431,16 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
             }
             MatchType::Dfa => {
                 match self.find_dfa_forward(text, start) {
-                    dfa::Result::Match((s, _)) => {
+                    dfa::Result::Match((s, e)) => {
+                        // We need the +1 here to account for lookahead
+                        // operators.
+                        let e = if self.ro.nfa.uses_bytes() {
+                            cmp::min(e + 1, text.len())
+                        } else {
+                            cmp::min(next_utf8(text, e), text.len())
+                        };
                         self.captures_nfa(
-                            MatchNfaType::Auto, slots, text, s)
+                            MatchNfaType::Auto, slots, &text[..e], s)
                     }
                     dfa::Result::NoMatch => None,
                     dfa::Result::Quit => {
diff --git a/src/utf8.rs b/src/utf8.rs
index 648a05fd2f..cd5641ace9 100644
--- a/src/utf8.rs
+++ b/src/utf8.rs
@@ -19,6 +19,25 @@ const TAG_TWO: u8 = 0b1100_0000;
 const TAG_THREE: u8 = 0b1110_0000;
 const TAG_FOUR: u8 = 0b1111_0000;
 
+/// Returns the smallest possible index of the next valid UTF-8 sequence
+/// starting after `i`, or `i + 1` if `i` is out of bounds for `text`.
+pub fn next_utf8(text: &[u8], i: usize) -> usize {
+    let b = match text.get(i) {
+        None => return i + 1,
+        Some(&b) => b,
+    };
+    let inc = if b <= 0x7F {
+        1
+    } else if b <= 0b110_11111 {
+        2
+    } else if b <= 0b1110_1111 {
+        3
+    } else {
+        4
+    };
+    i + inc
+}
+
 /// Encode the given Unicode character to `dst` as a single UTF-8 sequence.
 ///
 /// If `dst` is not long enough, then `None` is returned. Otherwise, the number