-
Notifications
You must be signed in to change notification settings - Fork 440
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Major refactoring and performance improvements.
Overview of changes:

* Instruction set has been redesigned to be smaller, mostly by collapsing empty-width matches into one instruction type. In addition to moving instruction-matching out of the matching engine, this makes matching engine code much simpler.
* Rewrote input handling to use an inline representation of `Option<char>` and clearer position handling with the `Input` trait.
* Added a new bounded backtracking matching engine that is invoked for small regexes/inputs. It's about twice as fast as the full NFA matching engine.
* Implemented caching for both the NFA and backtracking engines. This avoids costly allocations on subsequent uses of the regex.
* Overhauled prefix handling at both discovery and matching. Namely, sets of prefix literals can now be extracted from regexes. Depending on what the prefixes look like, an Aho-Corasick DFA is built from them. (This adds a dependency on the `aho-corasick` crate.)
* When appropriate, use `memchr` to jump around in the input when there is a single common byte prefix. (This adds a dependency on the `memchr` crate.)
* Bring the `regex!` macro up to date. Unfortunately, it still implements the full NFA matching engine and doesn't yet have access to the new prefix DFA handling. Thus, its performance has gotten *worse* than the dynamic implementation in most cases. The docs have been updated to reflect this change.

Surprisingly, all of this required exactly one new application of `unsafe`, which is isolated in the `memchr` crate. (Aho-Corasick has no `unsafe` either!)

There should be *no* breaking changes in this commit. The only public facing change is the addition of a method to the `Replacer` trait, but it comes with a default implementation so that existing implementors won't break. (Its purpose is to serve as a hint as to whether or not replacement strings need to be expanded. This is crucial to speeding up simple replacements.)

Closes #21.
- Loading branch information
1 parent
258c261
commit c86c025
Showing
26 changed files
with
2,616 additions
and
1,232 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
agggtaaa|tttaccct 0 | ||
[cgt]gggtaaa|tttaccc[acg] 3 | ||
a[act]ggtaaa|tttacc[agt]t 9 | ||
ag[act]gtaaa|tttac[agt]ct 8 | ||
agg[act]taaa|ttta[agt]cct 10 | ||
aggg[acg]aaa|ttt[cgt]ccct 3 | ||
agggt[cgt]aa|tt[acg]accct 4 | ||
agggta[cgt]a|t[acg]taccct 3 | ||
agggtaa[cgt]|[acg]ttaccct 5 | ||
|
||
101745 | ||
100000 | ||
133640 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
// The Computer Language Benchmarks Game | ||
// http://benchmarksgame.alioth.debian.org/ | ||
// | ||
// contributed by the Rust Project Developers | ||
// contributed by TeXitoi | ||
// contributed by BurntSushi | ||
|
||
extern crate regex; | ||
|
||
use std::io::{self, Read}; | ||
use std::sync::Arc; | ||
use std::thread; | ||
|
||
/// Compiles the pattern expression into a `::regex::Regex` at runtime.
///
/// The result of `Regex::new` is unwrapped, so an invalid pattern panics
/// at the point of use.
macro_rules! regex {
    ($pattern:expr) => {
        ::regex::Regex::new($pattern).unwrap()
    };
}
|
||
fn main() { | ||
let mut seq = String::with_capacity(10 * (1 << 20)); | ||
io::stdin().read_to_string(&mut seq).unwrap(); | ||
let ilen = seq.len(); | ||
|
||
seq = regex!(">[^\n]*\n|\n").replace_all(&seq, ""); | ||
let clen = seq.len(); | ||
let seq_arc = Arc::new(seq.clone()); | ||
|
||
let variants = vec![ | ||
regex!("agggtaaa|tttaccct"), | ||
regex!("[cgt]gggtaaa|tttaccc[acg]"), | ||
regex!("a[act]ggtaaa|tttacc[agt]t"), | ||
regex!("ag[act]gtaaa|tttac[agt]ct"), | ||
regex!("agg[act]taaa|ttta[agt]cct"), | ||
regex!("aggg[acg]aaa|ttt[cgt]ccct"), | ||
regex!("agggt[cgt]aa|tt[acg]accct"), | ||
regex!("agggta[cgt]a|t[acg]taccct"), | ||
regex!("agggtaa[cgt]|[acg]ttaccct"), | ||
]; | ||
let mut counts = vec![]; | ||
for variant in variants { | ||
let seq = seq_arc.clone(); | ||
let restr = variant.to_string(); | ||
let future = thread::spawn(move || variant.find_iter(&seq).count()); | ||
counts.push((restr, future)); | ||
} | ||
|
||
let substs = vec![ | ||
(regex!("B"), "(c|g|t)"), | ||
(regex!("D"), "(a|g|t)"), | ||
(regex!("H"), "(a|c|t)"), | ||
(regex!("K"), "(g|t)"), | ||
(regex!("M"), "(a|c)"), | ||
(regex!("N"), "(a|c|g|t)"), | ||
(regex!("R"), "(a|g)"), | ||
(regex!("S"), "(c|g)"), | ||
(regex!("V"), "(a|c|g)"), | ||
(regex!("W"), "(a|t)"), | ||
(regex!("Y"), "(c|t)"), | ||
]; | ||
let mut seq = seq; | ||
for (re, replacement) in substs.into_iter() { | ||
seq = re.replace_all(&seq, replacement); | ||
} | ||
let rlen = seq.len(); | ||
|
||
for (variant, count) in counts { | ||
println!("{} {}", variant, count.join().unwrap()); | ||
} | ||
println!("\n{}\n{}\n{}", ilen, clen, rlen); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.