diff --git a/.travis.yml b/.travis.yml index f62594ea27..0e28eb4339 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,6 +7,7 @@ sudo: false script: - cargo build --verbose - cargo test --verbose + - ./run-shootout-test - | [ $TRAVIS_RUST_VERSION != nightly ] || ( cargo test --verbose --features pattern && diff --git a/Cargo.toml b/Cargo.toml index aaaeb98567..24dbf30803 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,15 @@ An implementation of regular expressions for Rust. [[test]] path = "regex_macros/tests/test_dynamic.rs" -name = "all" +name = "dynamic" + +[[test]] +path = "regex_macros/tests/test_dynamic_nfa.rs" +name = "dynamic_nfa" + +[[test]] +path = "regex_macros/tests/test_dynamic_backtrack.rs" +name = "dynamic_backtrack" [[bench]] name = "all" @@ -22,6 +30,8 @@ test = false bench = true [dependencies] +aho-corasick = "0.1" +memchr = "0.1" regex-syntax = { path = "regex-syntax", version = "0.1" } [dev-dependencies] diff --git a/README.md b/README.md index 98673988a2..1065b383f8 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ A Rust library for parsing, compiling, and executing regular expressions. 
[Documentation](http://doc.rust-lang.org/regex) + ## Usage Add this to your `Cargo.toml`: @@ -23,6 +24,7 @@ and this to your crate root: extern crate regex; ``` + # License `regex` is primarily distributed under the terms of both the MIT license and diff --git a/regex_macros/benches/regexdna-input.txt b/examples/regexdna-input.txt similarity index 100% rename from regex_macros/benches/regexdna-input.txt rename to examples/regexdna-input.txt diff --git a/examples/regexdna-output.txt b/examples/regexdna-output.txt new file mode 100644 index 0000000000..d36baa5be8 --- /dev/null +++ b/examples/regexdna-output.txt @@ -0,0 +1,13 @@ +agggtaaa|tttaccct 0 +[cgt]gggtaaa|tttaccc[acg] 3 +a[act]ggtaaa|tttacc[agt]t 9 +ag[act]gtaaa|tttac[agt]ct 8 +agg[act]taaa|ttta[agt]cct 10 +aggg[acg]aaa|ttt[cgt]ccct 3 +agggt[cgt]aa|tt[acg]accct 4 +agggta[cgt]a|t[acg]taccct 3 +agggtaa[cgt]|[acg]ttaccct 5 + +101745 +100000 +133640 diff --git a/examples/shootout-regex-dna.rs b/examples/shootout-regex-dna.rs new file mode 100644 index 0000000000..304e27a54f --- /dev/null +++ b/examples/shootout-regex-dna.rs @@ -0,0 +1,67 @@ +// The Computer Language Benchmarks Game +// http://benchmarksgame.alioth.debian.org/ +// +// contributed by the Rust Project Developers +// contributed by TeXitoi +// contributed by BurntSushi + +extern crate regex; + +use std::io::{self, Read}; +use std::sync::Arc; +use std::thread; + +macro_rules! 
regex { ($re:expr) => { ::regex::Regex::new($re).unwrap() } } + +fn main() { + let mut seq = String::with_capacity(10 * (1 << 20)); + io::stdin().read_to_string(&mut seq).unwrap(); + let ilen = seq.len(); + + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, ""); + let clen = seq.len(); + let seq_arc = Arc::new(seq.clone()); + + let variants = vec![ + regex!("agggtaaa|tttaccct"), + regex!("[cgt]gggtaaa|tttaccc[acg]"), + regex!("a[act]ggtaaa|tttacc[agt]t"), + regex!("ag[act]gtaaa|tttac[agt]ct"), + regex!("agg[act]taaa|ttta[agt]cct"), + regex!("aggg[acg]aaa|ttt[cgt]ccct"), + regex!("agggt[cgt]aa|tt[acg]accct"), + regex!("agggta[cgt]a|t[acg]taccct"), + regex!("agggtaa[cgt]|[acg]ttaccct"), + ]; + let mut counts = vec![]; + for variant in variants { + let seq = seq_arc.clone(); + let restr = variant.to_string(); + let future = thread::spawn(move || variant.find_iter(&seq).count()); + counts.push((restr, future)); + } + + let substs = vec![ + (regex!("B"), "(c|g|t)"), + (regex!("D"), "(a|g|t)"), + (regex!("H"), "(a|c|t)"), + (regex!("K"), "(g|t)"), + (regex!("M"), "(a|c)"), + (regex!("N"), "(a|c|g|t)"), + (regex!("R"), "(a|g)"), + (regex!("S"), "(c|g)"), + (regex!("V"), "(a|c|g)"), + (regex!("W"), "(a|t)"), + (regex!("Y"), "(c|t)"), + ]; + let mut seq = seq; + for (re, replacement) in substs.into_iter() { + seq = re.replace_all(&seq, replacement); + } + let rlen = seq.len(); + + for (variant, count) in counts { + println!("{} {}", variant, count.join().unwrap()); + } + println!("\n{}\n{}\n{}", ilen, clen, rlen); +} diff --git a/regex_macros/Cargo.toml b/regex_macros/Cargo.toml index 49eb9c7bf3..9ad09cc7b3 100644 --- a/regex_macros/Cargo.toml +++ b/regex_macros/Cargo.toml @@ -17,10 +17,6 @@ plugin = true path = "tests/test_native.rs" name = "all" -[[test]] -path = "benches/shootout-regex-dna.rs" -name = "shootout_regex_dna" - [[bench]] name = "all" path = "benches/bench_native.rs" diff --git a/regex_macros/benches/bench.rs b/regex_macros/benches/bench.rs index 
1a1e2379b1..9cb0c809a1 100644 --- a/regex_macros/benches/bench.rs +++ b/regex_macros/benches/bench.rs @@ -59,6 +59,13 @@ fn match_class_in_range(b: &mut Bencher) { bench_assert_match(b, re, &text); } +#[bench] +fn match_class_unicode(b: &mut Bencher) { + let re = regex!(r"\pL"); + let text = format!("{}a", repeat("☃5☃5").take(20).collect::()); + bench_assert_match(b, re, &text); +} + #[bench] fn replace_all(b: &mut Bencher) { let re = regex!("[cjrw]"); @@ -171,15 +178,19 @@ fn gen_text(n: usize) -> String { throughput!(easy0_32, easy0(), 32); throughput!(easy0_1K, easy0(), 1<<10); throughput!(easy0_32K, easy0(), 32<<10); +throughput!(easy0_1MB, easy0(), 1<<20); throughput!(easy1_32, easy1(), 32); throughput!(easy1_1K, easy1(), 1<<10); throughput!(easy1_32K, easy1(), 32<<10); +throughput!(easy1_1MB, easy1(), 1<<20); throughput!(medium_32, medium(), 32); throughput!(medium_1K, medium(), 1<<10); throughput!(medium_32K,medium(), 32<<10); +throughput!(medium_1MB, medium(), 1<<20); throughput!(hard_32, hard(), 32); throughput!(hard_1K, hard(), 1<<10); throughput!(hard_32K,hard(), 32<<10); +throughput!(hard_1MB, hard(), 1<<20); diff --git a/regex_macros/benches/random.txt b/regex_macros/benches/random.txt new file mode 100644 index 0000000000..dfae5cd142 --- /dev/null +++ b/regex_macros/benches/random.txt @@ -0,0 +1,513 @@ + +mnxnsynfvuugtbxsxbfxwreuspglnplefzwsp +tacfqcwnmodnmgnyiuvqoco +z + +qjuozfkexn +zoaxzncje +sldhqtmgxzyurfyzwazmmu +bbeuv +mzsrihycwcb +xzfqozfmlnpmrzpxxxytqs +xrg +mcplby +nmslhfgjowhzfxsvyddydnsyehdskbydbjksqtpet +indvfw +bvjvvw + +pddufodyqtyixbndtumndyz +xjjhtuvmsxhuwqulqtjhqrdqrmtbcphvyuqllocrnkpfv +zemshhz +wss +xewlrxfmgxnwgphcgefa +mbgsgbzrtthxweimcqzcaaheurdmd +osqefupespvh +z +tvvlakwzwjbrgjzfgubsmmonav +pjdskxcfgapsm +zqktqgkrcdrlskx +zwwfebhguskho +zlvvw +czwm +gojnpmboehlsazbexjjnuscqftrfufngygjdxcydib +d +afigycivicnknfxl +ljuwuopctiftfwctxecwipjnljyef +jonwbkodomzhqvlf +jdkizhognqsdogunwedjsmsdzho +zxvni +oynfjf +muvokjuqz 
+azuwrwtuxzfopwrcex +ixrjinlvxjmn +blaegnmbhsgsbmebwazaeguugtkowexgnqtbfkldadddv +tzabyoftyov +ctbtqbzscxzviuvcigwuwusrdro +ljynr +gnnnyyxslrhsbj +hhzlw +hijalf +rxlfqk +mhaofforwznvmcgplinludpgkucpa +gvvxsqqfmu +xxqhoyosixjfhjuxpv +faadjpvamjekreepizurntvwdynozfawsfawyms + +lcbutr +aqyxvpozkjrecrkl +lfmochahrr +ptqyomjlwo +vcmslulznx +lmlsskcihrmxauztuarydlp +beiqsrfnmvmlmybmwpektjbikvpggthpabqsgmjhnthvysuhwbigillugjsp +dfsuegseffwcsnvsrqedytblbpzbfeyfsq +kypvqctrkuds +ylqeduokzgdqaxelhftxnxbidu +bprzyayfopxdsmfhhfqowa +ymiutdtlfaaxpbtaeslv +ggago + +owpbicekdeykzfgcbgzobdvvrtetvcv +xsrlgingstiez +gyncqvq +xasohmeiwyscpehctmzmsnjklg +xsudghakxlw +dzqlfptjogzpkvwuticcyugnyopypuqqc +wlxshxbhdvuherumoppcc + +znyaptivzncvkpeyeipynqefjxjjcsgfqbnezeebtowdrbjaqjlbxwvyikrmxjwoxngqgvfpbniftnmszuxg +umwpwwyvufy +pallkjtnrmtauqxauewgygwkjjwebbkabhtxticxmxfujpxlrpzlrozfslkzfdsswlmmsbdgjwmjnummk +dhsxylejzityahtqqzmohrpzjprrsraztpnuagtyzfjdekthvdogfidksrdppr +ybc +fyukknoqfnkllkwflwempjijxgo +dltvlau +rhvrvlwsribfctuzodfqkdczfzxnetqqzflnhiyl +goxmcasmq +wljbhwkpahdotqhhrbhqzijv +lszewkgdmkezvgmbmllhpksdkoiwgkvqjmurshrptlctqsosuurndcuzjfwherotv +dudxxihygxblhgchbgzyzffb +eht +fvwxvqoltdcsd +rkuig +e +axhsacsmnicugul +rubtdlhjqndxdzzwfnkuzy +swxteuyxxsktkjgv +hzwwodlqaq +vxgecev +qnwla +vdxjuzpyoqhpmuunyffptopmeauhycs +dkzo +awrfzatzohslgvqlaezepmli +qgxatixvpkkhvkumbwmwcagtgyfljdok +amdnzstpvcqj +xsrvwvhjirzfgkessve +qezwbfltfbikbmoasvoflozsjhrljnszqiciuqmflrlqowwkoevuumh +babskcvavmtvsxqsewirucwzajjcfcqwsydydqo +ywfurpsl +edacsjjkjjewkxfoh +dcgkfpcjezurnuhiatrczcp +xsatnimwbcciu +grzmbrsvvcyigcbmcqfwiiknrohveubhyijxeyzfm +kqyewccgcqrrrznwxmoztlyseagbpyho +najju +nis +awgzdvfjkzlrsjcqfeacx +oisuflfigrjaex +desbdulyuwqxuxianyypybxwlql +ekmqgspvqpftpwswayh +egbyj +fznzprhvnnwcxgcc +wfdsueieosmugirxbymbpmfrspvrktjzguxm +qkjrufshwnfwwpbhukdjlaqvljlgubmqmhnha +hwqpudgnblhlxppbrmbznotteivuzguuwlhtkytky +w +yofkyzbpg +cenolnfnllkvhikrpttcxgqxmufvorekjruyjxmr + +hyexmpjijgzumawp 
+cdbevdilgopbzlo +fivelagckslkugdxprjxkylizewcptwxfhomzuituujixchadmnjoktnqa +csojvlinzmmkkfzqueamnuwkanzdzsavgohposbuoamoevehqrmcxdsuyelvvctoejzoertqormhaaxwofvjzekwt +sbkghhnhutrvwtyjaxndzyjamrhx +jjyqy +majwbnrhveuhrsbbbjrwpwuplifeseylqh +wyvutpxnkrnkuxxetjkkifpqb +dyzucmbcvgnjeecm +hz +uhnuipthxrzkqluosvk +lwqqzsdwiwvwaqfwlvubadlyizlo +jbd +oyzjeu +kydjkbsqxnbfiuesc +smeubjqrcxdvhsabzceyglqjzbfmoacmwvwjbhhxbr +uabipgecujfdfxpmdzrscdyvefizabgspqjrrkmgjt +xgvdgzryz +lw +uimob +ifhn +bqph +ole +g +wt +k +yslzrkwkundxfdibwqvucemepqxlmlpyngabbeciuzhptpjdetyngrtxrdtzmvq +ccwapidp + +bwvrgvmtshevrophy +ni +fdkplu +mdykey +i +rhsrenoetdggpjb +djmkplpeabsholx +judxtub +fooakqwvocvpcrvxqhvtmpvhkrecy +uuxscjillynilbkrgt +evtinrmilniguarqritpeipwochmdw +sxaqzjybydyvnmmjtdcgkjnqfcklbfpkdfyewgcukqoiegyfp +kg +ovrwieqhy +jcxqtkerzjwhs +xeonglszbgypafhmqcaseimzjgebkvigbqwsayrnrprtuvhsxyitfqygohgorcdnufbcyvevvgzmjrgjqqquwkszplogx +zdketqqv +yebckucwayckeezfvtnavglpjh +zorkfrwk +pad +xqaquxudybwtgixbfktinctfirjfdayh +rieknj +ebk +qzbcfywfdmhsdruhopovemafijbscagllkmhmof + +asbsnbddlobwoqatfhkbhhsymzqxjuixwreheugvngmgcuqpkjhhfwpbarqaxrwgwnjbanljlds +etevdvlc +lqyjrnmenhn +k +tsf +zczgeavcexh +jlpuxywtsrvnvluruqhecjca +ir +rikrgkmhwaosodkxgcnrexfmdrszhnmutpvwztg +bffjqovvkemctnsgeh +weysbhzixiipfithjfsk +usyzvaiyuhmksfluoirfbnsu +o +cgawpdakaszeafdtbdkqtlzkrpnoqomqvuaqcfmzgvfegovtfaonelpv +izmrcjlk +xmzemniyrzy +knqexaafsdlimdamcrprlshq +qkmqw +dntgjwsibclvposdwjuklvtejjjdjibgpyynqpgprvvaetshhmvfkcpb +otvazkrkklrxfotpopyjte +fghkcnpi +rulyaihsowvcgbzeiblhuhhfbmncqsuuqcxvseorn +exirzfmojnxcoqom +zsgpgtokun +zvamxfocorganbtlafifwdqmqtsnktbwwtewborq + +cxlnaspjqvsitjyzyriqsuorjsrvzqenisprttudxntsbqrpjtdkxnwcwgjyxmgtqljcrmrbrmyvosojzlumcmjcgfjsdehec +mvx +mt +mckr +teulvroifk +laaicc +koufy +bexmwsvyarnznebdfy +ripvviosbqijsxnjilwddaqaqemzsdarnxmfooxghoypizwtbueo +ljycycuqwfnzbambibqdixmkkvwtubepla +cis +kcg +vmbbiuuoamenzepuagpfujevfstqtndjxjchdvycfrrrowochtjdmkklgnhf 
+pmorrwguxkvdxpluatagaziin + +uwvzbmkmykjkmknzppklx +pnzxuvsrjunqxercsnvayhykcazdeclomdsasgkpqpiufyfqsxhj +yceizkddwojgweegcllaagpvrpo +ek +kuxxgbezqyxvfaxdwnqdgqsmneijunxzlwxkrs +ldldbrxmvtjlqxifngmactzqcygkvuteffcmvphevilabgukatqakamjlridznodcvblvlogulmcixxfimh +iuzjootuywjqklolzzhpeaynydjwtufjavbozxnzckuzdodkvkjfmhinelv +swlfkcufscfcovmghqwcrtxjukwafoeogrkgubbqgwzm +gjcylkwgzroubdssuqeykqjcmguso +fzq +srfvysoxtlylctp + +pbfeiuzwoyixews +ocvvunfsjnrtklmuuzjojw +xdjcnrpqhmpmpcwacpcdtmbsczvhllkqapzjuaf +nfnuvjz +fwnuiyqpn +wshxxxpzzxp +hibrxcfeqca + +wqhlllarl +bukcbojv +plrytapy +xm +vlgfqoyzdczqbbaxjwbjjevjhxgopuqvqcrj +vpjqfbdnsdxlbuuiqocvrhap +mgumjbvnnzgnrdru +gcgzugazxdcamrhczfzhtmdjj +uislwq +vooai +zjuqfmebuzsqngzekyajujkopvayxtdzvugwwucvlsbrnhitfotmhhmgddlzlvqrkcponictrfweuilfjiuoabkfdvpjiqjrrgi +aptjfhmrnxaq +hbs +w +mwmoxqvucwygunplzvxtxpk +fgmqmtlorfzytjdzffsosfccnfwugrsrynuej +rpmpenrhsxoefnblyumjqwvuyszyppnttuyvazjdug +zdzxraxkroknkmqgvuoqeqdtvclsvvuwmdwzfugcpteohlogxubyoebvrzbqzklvehfcqadtdrkpubfhmokzwyosogepwragcpwxo +ax +dz +de + +thvkdmnbdws + +ejmubw +umvwkaubzurf +wyxtxeluaoox +wwbioobtgmkebxo +miglgnafmdarzkeblyjctuayzyoeqnfnbtrcbymdzkzg +loavxq +kzhllgsenxlbgdbfzwbg +yxflogzsohlcycbyzegeubfflouvtuatixhjvicjegltjiy +jigqfjppafdiarc +mcnmwtachgearonfcymvjbrnljjxmlzkudvzqsarnfysmxlfrtlvjxwvpdbhvwysnvcdozfcruhjwnucdzakkilmlfgjiolcatpfusm + +n +pdjunfcz +dc +edxkkxabsbvmvifiinnoccki +bc +gwtwsvorwzfqpz +exidmexstfflkhi +s +s +c +wtcjfywlayhpbqktcepoybowtkrmnumqsg +ozclkgjdmdk +jmegtbunyexurvfexhqptnqzie +tkoenpagzwqfawlxvzaijsjqhmg +swodqfjpdqcbkc +ujokogocyaygdibgpglecis +shlmdmgonvpuaxlhrymkxtiytmv +brhk +jmsyiuomiywxhegilycjprkyfgojdo + +wzdzrgpdiosdsvkcw +odlnmsfnjrcsnflviwvawybpczdkzvdocpwrmavz +p +ubowamlskcqhdxuckrxa +fawhntiwhmdwkddnahmtajqqazpdygttqivhdiodkcpcwv +gmxujmmaufmbipaiulhurzkfdg +eixjhmbaeoybiwk +kumntgrgiofcmujlzbcopuobambsw +mnjkqiyb +iktwnsnv +hfuzcl +tqiyqvagbqgtowpjbedgjot +dfemvamelxadkztogliizdtsddoboafawficudlefo 
+raecmxiiibljryswntpfed +mbwrtsebkeegw +x +epp +he + +vnztrswhiusokqdkmsnpuswucvfhcthjbtam +baxlwidsgbdpzvnlj +tcbjjoadrzo +aiidahyllzzsg + +igebuubweicbssgddpmqxunrawavuglmpxrtkqsvjjtscibqiejjfgfnovokodmqcqitlteiakooupvzkwucucrfdzjvjbqbkgutoybmpfvhbutigdxhfiqfplyciz +cnrhbjdnjftwfwlwzrdkwhajgsizsi +qfntnt +okqyfnbresp +asyg +mjqdkdyggdxzwuzglays +h +ifaqcazoy +fol +vvsusbnugduxsceozmsarbp +epjwtorx +bwiuxxiyc +cw +bwogruhctwkfvbexjnwircykxyzjmats +kygiochfwlpsvmxcgmtjrgvfdptd +q +qmpqe + +z +jghffhqfoecmszunhxmzmzhlmbrvjabhrkihgjmvckhkfpaygjkg + +kfiyfgounmhlvhupswqdgws +ezzdpyqucqoocsdcjtruqpokldfkmjhqzoynirybsifyaxnaxppthjoqy +nwetlgzwrhkhtuubbkbepuhbllxspvagxrqokwnrhkbwdwtp +hlazomrhqogoaxypqaszwfxxmutvbpuuvpdffuqskcbzlwyzcssnflkwiydoveyxjnzllzhyozbsa +hwnitkwbxcyibbqsluuqywbk + +ozpfjsdrc +yoepefuy +lvmspzepnetra +genbrcrmuqfvkaouvuymoxhcxotjjhk +pcshyqgbmqdubsdajnyfqvxkqvywffzn +ukhcbyzwslqeq +otfrmcbnhbyffxqregqoufdxucjunwdhlqqeiiawbxlpqeyzzopfungrryqdykgizrhqodirvazm +dhpfhzyq +cloz +eduupqifolfekve +qiec +ishnjukvomntmdthlkajxpiwk +y +axl +tmyskjqkjsvumizlal +wvvolwewsfxhhdieuagdcuhwsgqvswpbkdkpxskloalmr +ryfmhe +z +mmbpgsyrfvzdatbjrjhuipwt +llzwizmmuulgwocowwmugtaoewkhnqxparvtynlffffdfcocdbba + +pyczkzbmcgrdnxnmezsx +gsqe +mcocxcolcynhpecstsn +opnpplkccobjuhtbhirpzfxuktmpsiwbvsgiaavvdge +wpaldxzasnrbvtugjwytvtfttrh +zxecurevkjiyxy +wtnovebcmglkktic +fdpwfgvlvovxrwh +bmwgdullzy +uzwhagxinwqifxjbcntqzqoxkmpqxhe +jrfizsnwxwnnhb +inapddlahrp + +ndtvkceobe +buskgghihdjmjlwfc +j +rkvffxwtmzoeruhlsurwtnuh +cbvkhfepkdishfpqvijzrpleuy +jzdpxjhcgqnybssfegvrnpgyehdqpgjwudbwrjbavp +xzzvgqdrdwajmdmj +vfatwsxvwfdbdhnijdujoyotwvwjipuuetichcfmvgrsnjpqaaezjtkvc +lbfoqgfshrtwgdqufwnfuitdrjydqctqixlzufkdbp +zgau +qefdpmtkecvtj +kuphldkvnzdtpd +dti +fpd +gfrliyegxsb +i +qsddsrmkyfgzrjeqnitmnypbcakh +vfbvbrpuogzhzrbmklvhji +nkz +xlufbaoblbmeub +alwuzxzmobwdukvwnkiwmuqhuxfhevogdnqtmxjptqznrk +cngpoty + +ms +qvenfg +dmeaffm +jycfgnanbmoamhmarkmjcagbp +ysqmbhopgx +jczbzgwedsp + 
+zxzwjrxcwdtleizjlvifjwgxiibezwxhtzywqdi +mtgnlu +xboxirdchurkfnklnpkapnqfxnhrxyseiujrznjm + +zm +atddskbghcahlhql +szshwzmmvu +befdtpouamwhiisyybispkchpjhownatawjfbx + +ennkzbrlygd +zbt +upphzpdwzmlhhhbqvjsfmbnrar +ddcs +ipbxgzyudjyongtcyygncojdufnufqpdppgvq +gc +isu +foa +wf +jdlvqxgfbowhohhyyngbcs +zjuwjyucdwblatsnywaaoftlcamfbcnw +lzrioesuhoeevczuwrnltmkahfwiu +uicggfbddqltnjyxfltbnaekncnyxsit +zkxsqkqrwrzrxgxbsgxatybfr + +ptvmfyxdcglbfipcguqthjygzqnpqssscukzawynidtchjrrxwuxifoe +w +ohu +vg +zagpowezvbniybgold +lhqseqcxteiqtgnpanpvrmvvlltxh +mtfnxn +wyodtg + +rawpbgtpbaktqzmmpzxmrlwpvvmdsl +widcfbirvswraukbmkhf +vplrueuxomjkqrtjgyxjdkexttzyozawyq +hrpbahllznvmjudzxpbbv +tlavfrxygjfipkupwnbacltcfepeg +icu +otxcu +aewazy +hl + +fmrp +qaacthwzohenzjr +xbyebba +rvkph +mkhhmh +swme +zjmdoypaktglcyzobquunvthcdwegtbywpijxd +jvkuhnxqc +gibhqgjojsxt +bodbktzomiqujtbstqiyquwvqgufphqstenxvddkvtdh +bpusrxkfi +zgp +pmxvgamydyakituvvsucsuidrlznupcsinltmrahulhepxmhoqtfvpjkxzhrrinncuh +jzgkjjhjqykzelaszvcwvvwbnzsxdeaerfnaravk +ynanrqyrxo +zsmuxofullob +brklgrcqefdyoczy +qkpls +snhqumae +iqdtzjadzzvnqvdvjfsaf +nfqfdqiramueblxkaqxbbkxwywzgdbndjjiqk +tc +kp +cpuckbjsxhtxmomfesgxdpz +oseif +ybhxbvyxrpkrexrhjzoaxxohrhsniewsrktjnaztn +ggelspdzhzbchruhbjbjidgjwdlhdycetqaswh +jkgivsngygkbqtlmoj +dwpnanfvitxg +ospxbwxp +wgvmvrnjescemdoiralbkvemalifxnyhrbdgodml +hjtsnkzknkplbzsiwmneefdkihnhsamjsrxggclyjqgpqltizi + + +sykgbuypwwhweab +nvdkkkskmtiwpoerkon +sx +sbyflwwiqylbskdlxesmylpaz +dnwcjenaluwesyywfaezznwkdwpoesxpu +kie +dslccwfryol +gfhomgfn +zprjtfqvkotktzidmoyrivall +bunvsqkysdelozemnjoeqfolruulpbipm +ullyzfahpkhkja +hwd +kvyqtprpuulgsk +zotbkcadnxmfvqmtlbxalhughceyfcibtzzj +vvpjbgxygl +hpic +mhrqd +dv +thehuzdbaacoidjoljbysnqwrrxxplrdznmgiukkvjqbopb +moszjt +rmtbunktkywqirveeqfa +kse +wbfflnatgzobjrxghjgvcsyxoruenxhyomutbptswjajawqjpqafpdcstkiyjuilimecgejpqmyciolgcmdpcstzdozbmnza diff --git a/regex_macros/benches/shootout-regex-dna.rs b/regex_macros/benches/shootout-regex-dna.rs 
deleted file mode 100644 index 3de583451f..0000000000 --- a/regex_macros/benches/shootout-regex-dna.rs +++ /dev/null @@ -1,136 +0,0 @@ -// The Computer Language Benchmarks Game -// http://benchmarksgame.alioth.debian.org/ -// -// contributed by the Rust Project Developers - -// Copyright (c) 2014 The Rust Project Developers -// -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in -// the documentation and/or other materials provided with the -// distribution. -// -// - Neither the name of "The Computer Language Benchmarks Game" nor -// the name of "The Computer Language Shootout Benchmarks" nor the -// names of its contributors may be used to endorse or promote -// products derived from this software without specific prior -// written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED -// OF THE POSSIBILITY OF SUCH DAMAGE. 
- -#![feature(plugin, scoped)] -#![plugin(regex_macros)] - -extern crate regex; - -use std::io::{self, Read}; -use std::sync::Arc; -use std::thread; -use regex::NoExpand; - -#[test] -fn check() { - static ANSWER: &'static str = "\ -agggtaaa|tttaccct 0 -[cgt]gggtaaa|tttaccc[acg] 3 -a[act]ggtaaa|tttacc[agt]t 9 -ag[act]gtaaa|tttac[agt]ct 8 -agg[act]taaa|ttta[agt]cct 10 -aggg[acg]aaa|ttt[cgt]ccct 3 -agggt[cgt]aa|tt[acg]accct 4 -agggta[cgt]a|t[acg]taccct 3 -agggtaa[cgt]|[acg]ttaccct 5 - -101745 -100000 -133640"; - static SEQ: &'static str = include_str!("regexdna-input.txt"); - let got = run(SEQ.to_string()).connect("\n"); - assert_eq!(ANSWER, got); -} - -#[allow(dead_code)] -fn main() { - let mut input = String::new(); - io::stdin().read_to_string(&mut input).unwrap(); - println!("{}", run(input).connect("\n")); -} - -fn run(mut seq: String) -> Vec { - let ilen = seq.len(); - - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, NoExpand("")); - let seq_arc = Arc::new(seq.clone()); // copy before it moves - let clen = seq.len(); - - let seqlen = thread::scoped(move|| { - let substs = vec![ - (regex!("B"), "(c|g|t)"), - (regex!("D"), "(a|g|t)"), - (regex!("H"), "(a|c|t)"), - (regex!("K"), "(g|t)"), - (regex!("M"), "(a|c)"), - (regex!("N"), "(a|c|g|t)"), - (regex!("R"), "(a|g)"), - (regex!("S"), "(c|g)"), - (regex!("V"), "(a|c|g)"), - (regex!("W"), "(a|t)"), - (regex!("Y"), "(c|t)"), - ]; - let mut seq = seq; - for (re, replacement) in substs.into_iter() { - seq = re.replace_all(&seq, NoExpand(replacement)); - } - seq.len() - }); - - let variants = vec![ - regex!("agggtaaa|tttaccct"), - regex!("[cgt]gggtaaa|tttaccc[acg]"), - regex!("a[act]ggtaaa|tttacc[agt]t"), - regex!("ag[act]gtaaa|tttac[agt]ct"), - regex!("agg[act]taaa|ttta[agt]cct"), - regex!("aggg[acg]aaa|ttt[cgt]ccct"), - regex!("agggt[cgt]aa|tt[acg]accct"), - regex!("agggta[cgt]a|t[acg]taccct"), - regex!("agggtaa[cgt]|[acg]ttaccct"), - ]; - let (mut variant_strs, mut counts) = (vec!(), vec!()); - for variant in 
variants.into_iter() { - let seq_arc_copy = seq_arc.clone(); - variant_strs.push(variant.to_string()); - counts.push(thread::scoped(move|| { - variant.find_iter(&seq_arc_copy).count() - })); - } - - let mut olines = Vec::new(); - for (variant, count) in variant_strs.iter().zip(counts.into_iter()) { - olines.push(format!("{} {}", variant, count.join())); - } - olines.push("".to_string()); - olines.push(format!("{}", ilen)); - olines.push(format!("{}", clen)); - olines.push(format!("{}", seqlen.join())); - olines -} diff --git a/regex_macros/src/lib.rs b/regex_macros/src/lib.rs index cd6c8d232d..83917dc438 100644 --- a/regex_macros/src/lib.rs +++ b/regex_macros/src/lib.rs @@ -21,8 +21,6 @@ extern crate regex; extern crate syntax; extern crate rustc; -use std::rc::Rc; - use syntax::ast; use syntax::codemap; use syntax::ext::build::AstBuilder; @@ -35,9 +33,8 @@ use syntax::ptr::P; use rustc::plugin::Registry; use regex::Regex; -use regex::native::{ - Inst, Program, Dynamic, ExDynamic, Native, - simple_case_fold, +use regex::internal::{ + Inst, LookInst, OneChar, CharRanges, Program, Dynamic, Native, }; /// For the `regex!` syntax extension. Do not use. @@ -61,11 +58,6 @@ pub fn plugin_registrar(reg: &mut Registry) { /// direct `match pc { ... }`. The generators can be found in /// `step_insts` and `add_insts`. /// -/// Other more minor changes include eliding code when possible (although this -/// isn't completely thorough at the moment), and translating character class -/// matching from using a binary search to a simple `match` expression (see -/// `match_class`). -/// /// It is strongly recommended to read the dynamic implementation in vm.rs /// first before trying to understand the code generator. The implementation /// strategy is identical and vm.rs has comments and will be easier to follow. @@ -86,7 +78,7 @@ fn native(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[ast::TokenTree]) } }; let prog = match re { - Dynamic(ExDynamic { ref prog, .. 
}) => prog.clone(), + Dynamic(ref prog) => prog.clone(), Native(_) => unreachable!(), }; @@ -120,17 +112,12 @@ impl<'a> NfaGen<'a> { None => cx.expr_none(self.sp), } ); - let prefix_anchor = match self.prog.insts[1] { - Inst::StartText => true, - _ => false, - }; - let init_groups = self.vec_expr(0..num_cap_locs, - &mut |cx, _| cx.expr_none(self.sp)); + let prefix_anchor = self.prog.anchored_begin; - let prefix_lit = Rc::new(self.prog.prefix.as_bytes().to_vec()); - let prefix_bytes = self.cx.expr_lit(self.sp, ast::LitBinary(prefix_lit)); + // let prefix_lit = Rc::new(self.prog.prefix.as_bytes().to_vec()); + // let prefix_bytes = self.cx.expr_lit(self.sp, ast::LitBinary(prefix_lit)); - let check_prefix = self.check_prefix(); + // let check_prefix = self.check_prefix(); let step_insts = self.step_insts(); let add_insts = self.add_insts(); let regex = &*self.original; @@ -145,120 +132,136 @@ impl<'a> NfaGen<'a> { static CAP_NAMES: &'static [Option<&'static str>] = &$cap_names; #[allow(dead_code)] -fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, - start: usize, end: usize) -> Vec> { +fn exec<'t>( + mut caps: &mut [Option], + input: &'t str, + start: usize, +) -> bool { #![allow(unused_imports)] #![allow(unused_mut)] - use regex::native::{ - MatchKind, Exists, Location, Submatches, - StepState, StepMatchEarlyReturn, StepMatch, StepContinue, - CharReader, find_prefix, simple_case_fold, - }; + use regex::internal::{Char, CharInput, InputAt, Input, Inst}; + let input = CharInput::new(input); + let at = input.at(start); return Nfa { - which: which, input: input, - ic: 0, - chars: CharReader::new(input), - }.run(start, end); - - type Captures = [Option; $num_cap_locs]; + ncaps: caps.len(), + }.exec(&mut NfaThreads::new(), &mut caps, at); struct Nfa<'t> { - which: MatchKind, - input: &'t str, - ic: usize, - chars: CharReader<'t>, + input: CharInput<'t>, + ncaps: usize, } impl<'t> Nfa<'t> { #[allow(unused_variables)] - fn run(&mut self, start: usize, end: usize) 
-> Vec> { + fn exec( + &mut self, + mut q: &mut NfaThreads, + mut caps: &mut [Option], + mut at: InputAt, + ) -> bool { let mut matched = false; - let prefix_bytes: &[u8] = $prefix_bytes; - let mut clist = Threads::new(self.which); - let mut nlist = Threads::new(self.which); - let (mut clist, mut nlist) = (&mut clist, &mut nlist); - - let mut groups = $init_groups; - - self.ic = start; - let mut next_ic = self.chars.set(start); - while self.ic <= end { + let (mut clist, mut nlist) = (&mut q.clist, &mut q.nlist); + clist.empty(); nlist.empty(); +'LOOP: loop { if clist.size == 0 { - if matched { - break - } - - if $prefix_anchor && self.ic != 0 { - break + if matched || (!at.is_beginning() && $prefix_anchor) { + break; } - - $check_prefix + // TODO: Prefix matching... Hmm. + // Prefix matching now uses a DFA, so I think this is + // going to require encoding that DFA statically. } if clist.size == 0 || (!$prefix_anchor && !matched) { - self.add(&mut clist, 0, &mut groups) + self.add(clist, &mut caps, 0, at); } - - self.ic = next_ic; - next_ic = self.chars.advance(); - + let at_next = self.input.at(at.next_pos()); for i in 0..clist.size { let pc = clist.pc(i); - let step_state = self.step(&mut groups, &mut nlist, - clist.groups(i), pc); - match step_state { - StepMatchEarlyReturn => - return vec![Some(0), Some(0)], - StepMatch => { matched = true; break }, - StepContinue => {}, + let tcaps = clist.caps(i); + if self.step(nlist, caps, tcaps, pc, at, at_next) { + matched = true; + if caps.len() == 0 { + break 'LOOP; + } + break; } } + if at.char().is_none() { + break; + } + at = at_next; ::std::mem::swap(&mut clist, &mut nlist); nlist.empty(); } - match self.which { - Exists if matched => vec![Some(0), Some(0)], - Exists => vec![None, None], - Location | Submatches => groups.iter().map(|x| *x).collect(), - } + matched } // Sometimes `nlist` is never used (for empty regexes). 
#[allow(unused_variables)] #[inline] - fn step(&self, groups: &mut Captures, nlist: &mut Threads, - caps: &mut Captures, pc: usize) -> StepState { - $step_insts - StepContinue + fn step( + &self, + nlist: &mut Threads, + caps: &mut [Option], + thread_caps: &mut [Option], + pc: usize, + at: InputAt, + at_next: InputAt, + ) -> bool { + $step_insts; + false } - fn add(&self, nlist: &mut Threads, pc: usize, - groups: &mut Captures) { + fn add( + &self, + nlist: &mut Threads, + thread_caps: &mut [Option], + pc: usize, + at: InputAt, + ) { if nlist.contains(pc) { - return + return; } + let ti = nlist.add(pc); $add_insts } } - struct Thread { - pc: usize, - groups: Captures, + struct NfaThreads { + clist: Threads, + nlist: Threads, } struct Threads { - which: MatchKind, - queue: [Thread; $num_insts], + dense: [Thread; $num_insts], sparse: [usize; $num_insts], size: usize, } + struct Thread { + pc: usize, + caps: [Option; $num_cap_locs], + } + + impl NfaThreads { + fn new() -> NfaThreads { + NfaThreads { + clist: Threads::new(), + nlist: Threads::new(), + } + } + + fn swap(&mut self) { + ::std::mem::swap(&mut self.clist, &mut self.nlist); + } + } + impl Threads { - fn new(which: MatchKind) -> Threads { + fn new() -> Threads { Threads { - which: which, // These unsafe blocks are used for performance reasons, as it // gives us a zero-cost initialization of a sparse set. The // trick is described in more detail here: @@ -266,43 +269,30 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, // The idea here is to avoid initializing threads that never // need to be initialized, particularly for larger regexs with // a lot of instructions. 
- queue: unsafe { ::std::mem::uninitialized() }, + dense: unsafe { ::std::mem::uninitialized() }, sparse: unsafe { ::std::mem::uninitialized() }, size: 0, } } #[inline] - fn add(&mut self, pc: usize, groups: &Captures) { - let t = &mut self.queue[self.size]; - t.pc = pc; - match self.which { - Exists => {}, - Location => { - t.groups[0] = groups[0]; - t.groups[1] = groups[1]; - } - Submatches => { - for (slot, val) in t.groups.iter_mut().zip(groups.iter()) { - *slot = *val; - } - } - } - self.sparse[pc] = self.size; + fn add(&mut self, pc: usize) -> usize { + let i = self.size; + self.dense[i].pc = pc; + self.sparse[pc] = i; self.size += 1; + i } #[inline] - fn add_empty(&mut self, pc: usize) { - self.queue[self.size].pc = pc; - self.sparse[pc] = self.size; - self.size += 1; + fn thread(&mut self, i: usize) -> &mut Thread { + &mut self.dense[i] } #[inline] fn contains(&self, pc: usize) -> bool { let s = self.sparse[pc]; - s < self.size && self.queue[s].pc == pc + s < self.size && self.dense[s].pc == pc } #[inline] @@ -312,17 +302,17 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, #[inline] fn pc(&self, i: usize) -> usize { - self.queue[i].pc + self.dense[i].pc } #[inline] - fn groups<'r>(&'r mut self, i: usize) -> &'r mut Captures { - &mut self.queue[i].groups + fn caps<'r>(&'r mut self, i: usize) -> &'r mut [Option] { + &mut self.dense[i].caps } } } -::regex::native::Native(::regex::native::ExNative { +::regex::internal::Native(::regex::internal::ExNative { original: $regex, names: &CAP_NAMES, prog: exec, @@ -336,104 +326,78 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| { let nextpc = pc + 1; let body = match *inst { - Inst::StartLine => { + Inst::EmptyLook(LookInst::StartLine) => { quote_expr!(self.cx, { - nlist.add_empty($pc); - if self.chars.is_begin() || self.chars.prev == Some('\n') { - self.add(nlist, $nextpc, &mut *groups) + let prev = 
self.input.previous_at(at.pos()); + if prev.char().is_none() || prev.char() == '\n' { + self.add(nlist, thread_caps, $nextpc, at); } }) } - Inst::StartText => { + Inst::EmptyLook(LookInst::EndLine) => { quote_expr!(self.cx, { - nlist.add_empty($pc); - if self.chars.is_begin() { - self.add(nlist, $nextpc, &mut *groups) + if at.char().is_none() || at.char() == '\n' { + self.add(nlist, thread_caps, $nextpc, at); } }) } - Inst::EndLine => { + Inst::EmptyLook(LookInst::StartText) => { quote_expr!(self.cx, { - nlist.add_empty($pc); - if self.chars.is_end() || self.chars.cur == Some('\n') { - self.add(nlist, $nextpc, &mut *groups) + let prev = self.input.previous_at(at.pos()); + if prev.char().is_none() { + self.add(nlist, thread_caps, $nextpc, at); } }) } - Inst::EndText => { + Inst::EmptyLook(LookInst::EndText) => { quote_expr!(self.cx, { - nlist.add_empty($pc); - if self.chars.is_end() { - self.add(nlist, $nextpc, &mut *groups) + if at.char().is_none() { + self.add(nlist, thread_caps, $nextpc, at); } }) } - Inst::WordBoundary => { - quote_expr!(self.cx, { - nlist.add_empty($pc); - if self.chars.is_word_boundary() { - self.add(nlist, $nextpc, &mut *groups) - } - }) - } - Inst::NotWordBoundary => { + Inst::EmptyLook(ref wbty) => { + let m = if *wbty == LookInst::WordBoundary { + quote_expr!(self.cx, { w1 ^ w2 }) + } else { + quote_expr!(self.cx, { !(w1 ^ w2) }) + }; quote_expr!(self.cx, { - nlist.add_empty($pc); - if !self.chars.is_word_boundary() { - self.add(nlist, $nextpc, &mut *groups) + let prev = self.input.previous_at(at.pos()); + let w1 = prev.char().is_word_char(); + let w2 = at.char().is_word_char(); + if $m { + self.add(nlist, thread_caps, $nextpc, at); } }) } - Inst::Save(slot) => { - let save = quote_expr!(self.cx, { - let old = groups[$slot]; - groups[$slot] = Some(self.ic); - self.add(nlist, $nextpc, &mut *groups); - groups[$slot] = old; - }); - let add = quote_expr!(self.cx, { - self.add(nlist, $nextpc, &mut *groups); - }); - // If this is saving a 
submatch location but we request - // existence or only full match location, then we can skip - // right over it every time. - if slot > 1 { - quote_expr!(self.cx, { - nlist.add_empty($pc); - match self.which { - Submatches => $save, - Exists | Location => $add, - } - }) + Inst::Save(slot) => quote_expr!(self.cx, { + if $slot >= self.ncaps { + self.add(nlist, thread_caps, $nextpc, at); } else { - quote_expr!(self.cx, { - nlist.add_empty($pc); - match self.which { - Submatches | Location => $save, - Exists => $add, - } - }) + let old = thread_caps[$slot]; + thread_caps[$slot] = Some(at.pos()); + self.add(nlist, thread_caps, $nextpc, at); + thread_caps[$slot] = old; } - } - Inst::Jump(to) => { - quote_expr!(self.cx, { - nlist.add_empty($pc); - self.add(nlist, $to, &mut *groups); - }) - } - Inst::Split(x, y) => { - quote_expr!(self.cx, { - nlist.add_empty($pc); - self.add(nlist, $x, &mut *groups); - self.add(nlist, $y, &mut *groups); - }) - } + }), + Inst::Jump(to) => quote_expr!(self.cx, { + self.add(nlist, thread_caps, $to, at); + }), + Inst::Split(x, y) => quote_expr!(self.cx, { + self.add(nlist, thread_caps, $x, at); + self.add(nlist, thread_caps, $y, at); + }), // For Match, OneChar, CharClass, Any, AnyNoNL - _ => quote_expr!(self.cx, nlist.add($pc, &*groups)), + _ => quote_expr!(self.cx, { + let mut t = &mut nlist.thread(ti); + for (slot, val) in t.caps.iter_mut().zip(thread_caps.iter()) { + *slot = *val; + } + }), }; self.arm_inst(pc, body) }).collect::>(); - self.match_insts(arms) } @@ -443,77 +407,35 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| { let nextpc = pc + 1; let body = match *inst { - Inst::Match => { - quote_expr!(self.cx, { - match self.which { - Exists => { - return StepMatchEarlyReturn - } - Location => { - groups[0] = caps[0]; - groups[1] = caps[1]; - return StepMatch - } - Submatches => { - for (slot, val) in groups.iter_mut().zip(caps.iter()) { - *slot = *val; 
- } - return StepMatch - } - } - }) - } - Inst::OneChar { c, casei } => { - if casei { - let upc = simple_case_fold(c); - quote_expr!(self.cx, { - let upc = self.chars.prev.map(simple_case_fold); - if upc == Some($upc) { - self.add(nlist, $nextpc, caps); - } - }) - } else { - quote_expr!(self.cx, { - if self.chars.prev == Some($c) { - self.add(nlist, $nextpc, caps); - } - }) + Inst::Match => quote_expr!(self.cx, { + for (slot, val) in caps.iter_mut().zip(thread_caps.iter()) { + *slot = *val; } - } - Inst::CharClass(ref cls) => { - let ranges: Vec<(char, char)> = - cls.iter().map(|r| (r.start, r.end)).collect(); - let mranges = self.match_class(&ranges); - let get_char = - if cls.is_case_insensitive() { - quote_expr!( - self.cx, - simple_case_fold(self.chars.prev.unwrap())) - } else { - quote_expr!(self.cx, self.chars.prev.unwrap()) - }; + return true; + }), + Inst::Char(OneChar { c, casei }) => quote_expr!(self.cx, { + if $c == at.char() || ($casei && $c == at.char().case_fold()) { + self.add(nlist, thread_caps, $nextpc, at_next); + } + return false; + }), + Inst::Ranges(CharRanges { ref ranges, casei }) => { + let match_class = self.match_class(ranges); quote_expr!(self.cx, { - if self.chars.prev.is_some() { - let c = $get_char; - if $mranges { - self.add(nlist, $nextpc, caps); - } + let mut c = at.char(); + if $casei { + c = c.case_fold(); } - }) - } - Inst::Any => { - quote_expr!(self.cx, self.add(nlist, $nextpc, caps)) - } - Inst::AnyNoNL => { - quote_expr!(self.cx, { - if self.chars.prev != Some('\n') { - self.add(nlist, $nextpc, caps); + if let Some(c) = c.as_char() { + if $match_class { + self.add(nlist, thread_caps, $nextpc, at_next); + } } - () + return false; }) } - // EmptyBegin, EmptyEnd, EmptyWordBoundary, Save, Jump, Split - _ => self.empty_block(), + // EmptyLook, Save, Jump, Split + _ => quote_expr!(self.cx, { return false; }), }; self.arm_inst(pc, body) }).collect::>(); @@ -526,13 +448,13 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t 
str, // table). fn match_class(&self, ranges: &[(char, char)]) -> P { let mut arms = ranges.iter().map(|&(start, end)| { - let pat = self.cx.pat(self.sp, ast::PatRange(quote_expr!(self.cx, $start), - quote_expr!(self.cx, $end))); + let pat = self.cx.pat( + self.sp, ast::PatRange( + quote_expr!(self.cx, $start), quote_expr!(self.cx, $end))); self.cx.arm(self.sp, vec!(pat), quote_expr!(self.cx, true)) }).collect::>(); arms.push(self.wild_arm_expr(quote_expr!(self.cx, false))); - let match_on = quote_expr!(self.cx, c); self.cx.expr_match(self.sp, match_on, arms) } @@ -540,24 +462,24 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, // Generates code for checking a literal prefix of the search string. // The code is only generated if the regex *has* a literal prefix. // Otherwise, a no-op is returned. - fn check_prefix(&self) -> P { - if self.prog.prefix.len() == 0 { - self.empty_block() - } else { - quote_expr!(self.cx, - if clist.size == 0 { - let haystack = &self.input.as_bytes()[self.ic..]; - match find_prefix(prefix_bytes, haystack) { - None => break, - Some(i) => { - self.ic += i; - next_ic = self.chars.set(self.ic); - } - } - } - ) - } - } + // fn check_prefix(&self) -> P { + // if self.prog.prefixes.len() == 0 { + // self.empty_block() + // } else { + // quote_expr!(self.cx, + // if clist.size == 0 { + // let haystack = &self.input.as_bytes()[self.ic..]; + // match find_prefix(prefix_bytes, haystack) { + // None => break, + // Some(i) => { + // self.ic += i; + // next_ic = self.chars.set(self.ic); + // } + // } + // } + // ) + // } + // } // Builds a `match pc { ... }` expression from a list of arms, specifically // for matching the current program counter with an instruction. @@ -595,7 +517,6 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, } } - // Converts `xs` to a `[x1, x2, .., xN]` expression by calling `to_expr` // on each element in `xs`. 
fn vec_expr>(&self, xs: It, diff --git a/regex_macros/tests/test_dynamic.rs b/regex_macros/tests/test_dynamic.rs index 1056adfb29..7fa505a2ae 100644 --- a/regex_macros/tests/test_dynamic.rs +++ b/regex_macros/tests/test_dynamic.rs @@ -1,4 +1,4 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. // @@ -15,14 +15,15 @@ extern crate regex; // Due to macro scoping rules, this definition only applies for the modules // defined below. Effectively, it allows us to use the same tests for both // native and dynamic regexes. -macro_rules! regex( - ($re:expr) => ( - match ::regex::Regex::new($re) { - Ok(re) => re, - Err(err) => panic!("{}", err), - } - ); -); +// +// This is also used to test the various matching engines. This one exercises +// the normal code path which automatically chooses the engine based on the +// regex and the input. Other dynamic tests explicitly set the engine to use. +macro_rules! regex { + ($re:expr) => { + ::regex::Regex::with_engine(None, 10 * (1 << 20), $re).unwrap() + } +} #[cfg(feature = "pattern")] macro_rules! searcher_expr { ($e:expr) => ($e) } diff --git a/regex_macros/tests/test_dynamic_backtrack.rs b/regex_macros/tests/test_dynamic_backtrack.rs new file mode 100644 index 0000000000..8f5d0dfa9a --- /dev/null +++ b/regex_macros/tests/test_dynamic_backtrack.rs @@ -0,0 +1,27 @@ +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![cfg_attr(feature = "pattern", feature(core))] + +extern crate regex; + +macro_rules! 
regex { + ($re:expr) => {{ + let e = Some(::regex::internal::MatchEngine::Backtrack); + ::regex::Regex::with_engine(e, 10 * (1 << 20), $re).unwrap() + }} +} + +#[cfg(feature = "pattern")] +macro_rules! searcher_expr { ($e:expr) => ($e) } +#[cfg(not(feature = "pattern"))] +macro_rules! searcher_expr { ($e:expr) => ({}) } + +mod tests; diff --git a/regex_macros/tests/test_dynamic_nfa.rs b/regex_macros/tests/test_dynamic_nfa.rs new file mode 100644 index 0000000000..e5da5c879e --- /dev/null +++ b/regex_macros/tests/test_dynamic_nfa.rs @@ -0,0 +1,27 @@ +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![cfg_attr(feature = "pattern", feature(core))] + +extern crate regex; + +macro_rules! regex { + ($re:expr) => {{ + let e = Some(::regex::internal::MatchEngine::Nfa); + ::regex::Regex::with_engine(e, 10 * (1 << 20), $re).unwrap() + }} +} + +#[cfg(feature = "pattern")] +macro_rules! searcher_expr { ($e:expr) => ($e) } +#[cfg(not(feature = "pattern"))] +macro_rules! searcher_expr { ($e:expr) => ({}) } + +mod tests; diff --git a/run-shootout-test b/run-shootout-test new file mode 100755 index 0000000000..8fef4e3a5e --- /dev/null +++ b/run-shootout-test @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e + +cargo build --example shootout-regex-dna +exec diff \ + ./examples/regexdna-output.txt \ + <(./target/debug/examples/shootout-regex-dna < ./examples/regexdna-input.txt) diff --git a/src/backtrack.rs b/src/backtrack.rs new file mode 100644 index 0000000000..e397b2ebf9 --- /dev/null +++ b/src/backtrack.rs @@ -0,0 +1,269 @@ +// Copyright 2014-2015 The Rust Project Developers. 
See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// This is the backtracking matching engine. It has the same exact capability +// as the full NFA simulation, except it is artificially restricted to small +// regexes on small inputs because of its memory requirements. +// +// In particular, this is a *bounded* backtracking engine. It retains worst +// case linear time by keeping track of the states that it has visited (using a +// bitmap). Namely, once a state is visited, it is never visited again. Since a +// state is keyed by `(instruction index, input index)`, we have that its time +// complexity is `O(mn)`. +// +// The backtracking engine can beat out the NFA simulation on small +// regexes/inputs because it doesn't have to keep track of multiple copies of +// the capture groups. In benchmarks, the backtracking engine is roughly twice +// as fast as the full NFA simulation. + +use input::{Input, InputAt, CharInput}; +use program::{Inst, InstIdx, Program}; +use re::CaptureIdxs; + +type Bits = u32; +const BIT_SIZE: usize = 32; +const MAX_PROG_SIZE: usize = 100; +const MAX_INPUT_SIZE: usize = 256 * (1 << 10); + +// Total memory usage in bytes is determined by: +// +// ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (bits / 8) +// +// With the above settings, this comes out to ~3.2MB. Mostly these numbers +// were picked empirically with suspicious benchmarks. + +/// A backtracking matching engine. +#[derive(Debug)] +pub struct Backtrack<'r, 't, 'c> { + prog: &'r Program, + input: CharInput<'t>, + caps: &'c mut CaptureIdxs, + m: BackMachine, +} + +/// Shared cached state between multiple invocations of a backtracking engine +/// in the same thread.
+/// +/// It is exported so that it can be cached by `program::Program`. +#[derive(Debug)] +pub struct BackMachine { + jobs: Vec, + visited: Vec, +} + +impl BackMachine { + /// Create new empty state for the backtracking engine. + pub fn new() -> BackMachine { + BackMachine { + jobs: vec![], + visited: vec![], + } + } +} + +/// A job is an explicit unit of stack space in the backtracking engine. +/// +/// The "normal" representation is a single state transition, which corresponds +/// to an NFA state and a character in the input. However, the backtracking +/// engine must keep track of old capture group values. We use the explicit +/// stack to do it. +#[derive(Clone, Copy, Debug)] +enum Job { + Inst { pc: InstIdx, at: InputAt }, + SaveRestore { slot: usize, old_pos: Option }, +} + +impl<'r, 't, 'c> Backtrack<'r, 't, 'c> { + /// Execute the backtracking matching engine. + /// + /// If there's a match, `exec` returns `true` and populates the given + /// captures accordingly. + pub fn exec( + prog: &'r Program, + mut caps: &mut CaptureIdxs, + text: &'t str, + start: usize, + ) -> bool { + let input = CharInput::new(text); + let start = input.at(start); + let m = prog.backtrack.get(); + let mut b = Backtrack { + prog: prog, + input: input, + caps: caps, + m: m, + }; + let matched = b.exec_(start); + prog.backtrack.put(b.m); + matched + } + + /// Returns true iff the given regex and input can be executed by this + /// engine with reasonable memory usage. + pub fn should_exec(prog: &'r Program, input: &str) -> bool { + prog.insts.len() <= MAX_PROG_SIZE && input.len() <= MAX_INPUT_SIZE + } + + fn clear(&mut self) { + // Reset the job memory so that we start fresh. + self.m.jobs.truncate(0); + + // Now we need to clear the bit state set. + // We do this by figuring out how much space we need to keep track + // of the states we've visited. + // Then we reset all existing allocated space to 0. + // Finally, we request more space if we need it. 
+ // + // This is all a little circuitous, but doing this unsafely + // doesn't seem to have a measurable impact on performance. + // (Probably because backtracking is limited to such small + // inputs/regexes in the first place.) + let visited_len = + (self.prog.insts.len() * (self.input.len() + 1) + BIT_SIZE - 1) + / + BIT_SIZE; + for v in &mut self.m.visited { + *v = 0; + } + let cur_visited_cap = self.m.visited.capacity(); + if visited_len > cur_visited_cap { + self.m.visited.reserve_exact(visited_len - cur_visited_cap); + for _ in 0..(visited_len - cur_visited_cap) { + self.m.visited.push(0); + } + } + } + + fn exec_(&mut self, mut at: InputAt) -> bool { + self.clear(); + if self.prog.anchored_begin { + return if !at.is_beginning() { + false + } else { + match self.input.prefix_at(&self.prog.prefixes, at) { + None => false, + Some(at) => self.backtrack(at), + } + }; + } + loop { + if !self.prog.prefixes.is_empty() { + at = match self.input.prefix_at(&self.prog.prefixes, at) { + None => return false, + Some(at) => at, + }; + } + if self.backtrack(at) { + return true; + } + if at.char().is_none() { + return false; + } + at = self.input.at(at.next_pos()); + } + } + + // This `inline(always)` seems to result in about a 10-15% increase in + // throughput on the `hard` benchmarks (over a standard `inline`). ---AG + #[inline(always)] + fn backtrack(&mut self, start: InputAt) -> bool { + self.push(0, start); + while let Some(job) = self.m.jobs.pop() { + match job { + Job::Inst { pc, at } => { + if self.step(pc, at) { + return true; + } + } + Job::SaveRestore { slot, old_pos } => { + self.caps[slot] = old_pos; + } + } + } + false + } + + fn step(&mut self, mut pc: InstIdx, mut at: InputAt) -> bool { + use program::Inst::*; + loop { + // This loop is an optimization to avoid constantly pushing/popping + // from the stack. Namely, if we're pushing a job only to run it + // next, avoid the push and just mutate `pc` (and possibly `at`) + // in place. 
+ match self.prog.insts[pc] { + Match => return true, + Save(slot) => { + if slot < self.caps.len() { + // If this path doesn't work out, then we save the old + // capture index (if one exists) in an alternate + // job. If the next path fails, then the alternate + // job is popped and the old capture index is restored. + let old_pos = self.caps[slot]; + self.push_save_restore(slot, old_pos); + self.caps[slot] = Some(at.pos()); + } + pc += 1; + } + Jump(pc2) => pc = pc2, + Split(x, y) => { + self.push(y, at); + pc = x; + } + EmptyLook(ref inst) => { + let prev = self.input.previous_at(at.pos()); + if inst.matches(prev.char(), at.char()) { + pc += 1; + } else { + return false; + } + } + Char(ref inst) => { + if inst.matches(at.char()) { + pc += 1; + at = self.input.at(at.next_pos()); + } else { + return false; + } + } + Ranges(ref inst) => { + if inst.matches(at.char()).is_some() { + pc += 1; + at = self.input.at(at.next_pos()); + } else { + return false; + } + } + } + if self.has_visited(pc, at) { + return false; + } + } + } + + fn push(&mut self, pc: InstIdx, at: InputAt) { + self.m.jobs.push(Job::Inst { pc: pc, at: at }); + } + + fn push_save_restore(&mut self, slot: usize, old_pos: Option) { + self.m.jobs.push(Job::SaveRestore { slot: slot, old_pos: old_pos }); + } + + fn has_visited(&mut self, pc: InstIdx, at: InputAt) -> bool { + let k = pc * (self.input.len() + 1) + at.pos(); + let k1 = k / BIT_SIZE; + let k2 = (1 << (k & (BIT_SIZE - 1))) as Bits; + if self.m.visited[k1] & k2 == 0 { + self.m.visited[k1] |= k2; + false + } else { + true + } + } +} diff --git a/src/char.rs b/src/char.rs new file mode 100644 index 0000000000..43661717ca --- /dev/null +++ b/src/char.rs @@ -0,0 +1,107 @@ +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. 
This file may not be copied, modified, or distributed +// except according to those terms. + +use std::char; +use std::cmp::Ordering; +use std::fmt; +use std::u32; + +use syntax; + +/// An inline representation of `Option`. +/// +/// This eliminates the need to do case analysis on `Option` to determine +/// ordinality with other characters. +/// +/// (The `Option` is not related to encoding. Instead, it is used in the +/// matching engines to represent the beginning and ending boundaries of the +/// search text.) +#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] +pub struct Char(u32); + +impl fmt::Debug for Char { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match char::from_u32(self.0) { + None => write!(f, "Empty"), + Some(c) => write!(f, "{:?}", c), + } + } +} + +impl Char { + /// Returns true iff the character is absent. + #[inline] + pub fn is_none(self) -> bool { self.0 == u32::MAX } + + /// Returns the length of the character's UTF-8 encoding. + /// + /// If the character is absent, then `0` is returned. + #[inline] + pub fn len_utf8(self) -> usize { + char::from_u32(self.0).map(|c| c.len_utf8()).unwrap_or(0) + } + + /// Returns the simple case folding of this character. + /// + /// If the character is absent, then absence is returned. + pub fn case_fold(self) -> Char { + char::from_u32(self.0).map(syntax::simple_case_fold).into() + } + + /// Returns true iff the character is a word character. + /// + /// If the character is absent, then false is returned. + pub fn is_word_char(self) -> bool { + char::from_u32(self.0).map(syntax::is_word_char).unwrap_or(false) + } + + /// Converts the character to a real primitive `char`. + /// + /// If the character is absent, then `None` is returned. + pub fn as_char(self) -> Option { + // This is only used in the `regex!` macro because it expands char + // classes into `match` expressions (instead of binary search). 
+ char::from_u32(self.0) + } +} + +impl From for Char { + fn from(c: char) -> Char { Char(c as u32) } +} + +impl From> for Char { + fn from(c: Option) -> Char { + c.map(|c| c.into()).unwrap_or(Char(u32::MAX)) + } +} + +impl PartialEq for Char { + #[inline] + fn eq(&self, other: &char) -> bool { self.0 == *other as u32 } +} + +impl PartialEq for char { + #[inline] + fn eq(&self, other: &Char) -> bool { *self as u32 == other.0 } +} + +impl PartialOrd for Char { + #[inline] + fn partial_cmp(&self, other: &char) -> Option { + self.0.partial_cmp(&(*other as u32)) + } +} + +impl PartialOrd for char { + #[inline] + fn partial_cmp(&self, other: &Char) -> Option { + (*self as u32).partial_cmp(&other.0) + } +} diff --git a/src/compile.rs b/src/compile.rs index 413da3e9e8..012328c6a3 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -1,4 +1,4 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. // @@ -8,149 +8,87 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -// Enable this to squash warnings due to exporting pieces of the representation -// for use with the regex! macro. See lib.rs for explanation. - -use self::Inst::*; - -use std::cmp; use syntax::{self, Expr, Repeater}; -use Error; -pub type InstIdx = usize; +use Error; +use program::{CharRanges, Inst, InstIdx, OneChar}; -/// An instruction, the underlying unit of a compiled regular expression -#[allow(missing_docs)] -#[derive(Debug, Clone)] -pub enum Inst { - /// When a Match instruction is executed, the current thread is successful. 
- Match, - OneChar { c: char, casei: bool }, - CharClass(syntax::CharClass), - Any, - AnyNoNL, - StartLine, - EndLine, - StartText, - EndText, - WordBoundary, - NotWordBoundary, - Save(usize), - Jump(InstIdx), - Split(InstIdx, InstIdx), -} +type Compiled = (Vec, Vec>); -/// Program represents a compiled regular expression. Once an expression is -/// compiled, its representation is immutable and will never change. +/// A regex compiler. /// -/// All of the data in a compiled expression is wrapped in "MaybeStatic" or -/// "MaybeOwned" types so that a `Program` can be represented as static data. -/// (This makes it convenient and efficient for use with the `regex!` macro.) -#[derive(Clone, Debug)] -pub struct Program { - /// A sequence of instructions. - pub insts: Vec, - /// If the regular expression requires a literal prefix in order to have a - /// match, that prefix is stored here. (It's used in the VM to implement - /// an optimization.) - pub prefix: String, +/// A regex compiler is responsible for turning a regex's AST into a sequence +/// of instructions. +pub struct Compiler { + size_limit: usize, + insts: Vec, + cap_names: Vec>, } -impl Program { - /// Compiles a Regex given its AST. - pub fn new(ast: Expr, size: usize) -> Result<(Program, Vec>), Error> { - let mut c = Compiler { - insts: Vec::with_capacity(100), - names: vec![None], - size_limit: size, - }; - - c.insts.push(Save(0)); - try!(c.compile(ast)); - c.insts.push(Save(1)); - c.insts.push(Match); - - // Try to discover a literal string prefix. - // This is a bit hacky since we have to skip over the initial - // 'Save' instruction. - let mut pre = String::with_capacity(5); - for inst in c.insts[1..].iter() { - match *inst { - OneChar { c, casei: false } => pre.push(c), - _ => break - } +impl Compiler { + /// Creates a new compiler that limits the size of the regex program + /// to the size given (in bytes). 
+ pub fn new(size_limit: usize) -> Compiler { + Compiler { + size_limit: size_limit, + insts: vec![], + cap_names: vec![None], } - - let Compiler { insts, names, .. } = c; - let prog = Program { - insts: insts, - prefix: pre, - }; - Ok((prog, names)) } - /// Returns the total number of capture groups in the regular expression. - /// This includes the zeroth capture. - pub fn num_captures(&self) -> usize { - let mut n = 0; - for inst in self.insts.iter() { - match *inst { - Save(c) => n = cmp::max(n, c+1), - _ => {} - } - } - // There's exactly 2 Save slots for every capture. - n / 2 + /// Compiles the given regex AST into a tuple of a sequence of + /// instructions and a sequence of capture groups, optionally named. + pub fn compile(mut self, ast: Expr) -> Result { + self.insts.push(Inst::Save(0)); + try!(self.c(ast)); + self.insts.push(Inst::Save(1)); + self.insts.push(Inst::Match); + Ok((self.insts, self.cap_names)) } -} -struct Compiler { - insts: Vec, - names: Vec>, - size_limit: usize, -} - -// The compiler implemented here is extremely simple. Most of the complexity -// in this crate is in the parser or the VM. -// The only tricky thing here is patching jump/split instructions to point to -// the right instruction. 
-impl Compiler { - fn check_size(&self) -> Result<(), Error> { - if self.insts.len() * ::std::mem::size_of::() > self.size_limit { - Err(Error::CompiledTooBig(self.size_limit)) - } else { - Ok(()) - } - } + fn c(&mut self, ast: Expr) -> Result<(), Error> { + use program::Inst::*; + use program::LookInst::*; - fn compile(&mut self, ast: Expr) -> Result<(), Error> { match ast { Expr::Empty => {}, Expr::Literal { chars, casei } => { - for c in chars { - self.push(OneChar { c: c, casei: casei }); + for mut c in chars { + if casei { + c = syntax::simple_case_fold(c); + } + self.push(Char(OneChar { c: c, casei: casei })); + } + } + Expr::AnyChar => self.push(Ranges(CharRanges::any())), + Expr::AnyCharNoNL => self.push(Ranges(CharRanges::any_nonl())), + Expr::Class(cls) => { + if cls.len() == 1 && cls[0].start == cls[0].end { + self.push(Char(OneChar { + c: cls[0].start, + casei: cls.is_case_insensitive(), + })); + } else { + self.push(Ranges(CharRanges::from_class(cls))); } } - Expr::AnyChar => self.push(Any), - Expr::AnyCharNoNL => self.push(AnyNoNL), - Expr::Class(cls) => self.push(CharClass(cls)), - Expr::StartLine => self.push(StartLine), - Expr::EndLine => self.push(EndLine), - Expr::StartText => self.push(StartText), - Expr::EndText => self.push(EndText), - Expr::WordBoundary => self.push(WordBoundary), - Expr::NotWordBoundary => self.push(NotWordBoundary), - Expr::Group { e, i: None, name: None } => try!(self.compile(*e)), + Expr::StartLine => self.push(EmptyLook(StartLine)), + Expr::EndLine => self.push(EmptyLook(EndLine)), + Expr::StartText => self.push(EmptyLook(StartText)), + Expr::EndText => self.push(EmptyLook(EndText)), + Expr::WordBoundary => self.push(EmptyLook(WordBoundary)), + Expr::NotWordBoundary => self.push(EmptyLook(NotWordBoundary)), + Expr::Group { e, i: None, name: None } => try!(self.c(*e)), Expr::Group { e, i, name } => { let i = i.expect("capture index"); - self.names.push(name); + self.cap_names.push(name); self.push(Save(2 * i)); - 
try!(self.compile(*e)); + try!(self.c(*e)); self.push(Save(2 * i + 1)); } Expr::Concat(es) => { for e in es { - try!(self.compile(e)); + try!(self.c(e)); } } Expr::Alternate(mut es) => { @@ -160,26 +98,26 @@ impl Compiler { } let e1 = es.remove(0); if es.len() == 0 { - try!(self.compile(e1)); + try!(self.c(e1)); return Ok(()); } let e2 = Expr::Alternate(es); // this causes recursion - let split = self.empty_split(); // push: split 0, 0 + let split = self.empty_split(); let j1 = self.insts.len(); - try!(self.compile(e1)); // push: insts for x - let jmp = self.empty_jump(); // push: jmp 0 + try!(self.c(e1)); + let jmp = self.empty_jump(); let j2 = self.insts.len(); - try!(self.compile(e2)); // push: insts for y + try!(self.c(e2)); let j3 = self.insts.len(); - self.set_split(split, j1, j2); // split 0, 0 -> split j1, j2 - self.set_jump(jmp, j3); // jmp 0 -> jmp j3 + self.set_split(split, j1, j2); + self.set_jump(jmp, j3); } Expr::Repeat { e, r: Repeater::ZeroOrOne, greedy } => { let split = self.empty_split(); let j1 = self.insts.len(); - try!(self.compile(*e)); + try!(self.c(*e)); let j2 = self.insts.len(); if greedy { @@ -192,7 +130,7 @@ impl Compiler { let j1 = self.insts.len(); let split = self.empty_split(); let j2 = self.insts.len(); - try!(self.compile(*e)); + try!(self.c(*e)); let jmp = self.empty_jump(); let j3 = self.insts.len(); @@ -205,7 +143,7 @@ impl Compiler { } Expr::Repeat { e, r: Repeater::OneOrMore, greedy } => { let j1 = self.insts.len(); - try!(self.compile(*e)); + try!(self.c(*e)); let split = self.empty_split(); let j2 = self.insts.len(); @@ -215,24 +153,32 @@ impl Compiler { self.set_split(split, j2, j1); } } - Expr::Repeat { e, r: Repeater::Range { min, max: None }, greedy } => { + Expr::Repeat { + e, + r: Repeater::Range { min, max: None }, + greedy, + } => { let e = *e; for _ in 0..min { - try!(self.compile(e.clone())); + try!(self.c(e.clone())); } - try!(self.compile(Expr::Repeat { + try!(self.c(Expr::Repeat { e: Box::new(e), r: 
Repeater::ZeroOrMore, greedy: greedy, })); } - Expr::Repeat { e, r: Repeater::Range { min, max: Some(max) }, greedy } => { + Expr::Repeat { + e, + r: Repeater::Range { min, max: Some(max) }, + greedy, + } => { let e = *e; for _ in 0..min { - try!(self.compile(e.clone())); + try!(self.c(e.clone())); } for _ in min..max { - try!(self.compile(Expr::Repeat { + try!(self.c(Expr::Repeat { e: Box::new(e.clone()), r: Repeater::ZeroOrOne, greedy: greedy, @@ -243,6 +189,16 @@ impl Compiler { self.check_size() } + fn check_size(&self) -> Result<(), Error> { + use std::mem::size_of; + + if self.insts.len() * size_of::() > self.size_limit { + Err(Error::CompiledTooBig(self.size_limit)) + } else { + Ok(()) + } + } + /// Appends the given instruction to the program. #[inline] fn push(&mut self, x: Inst) { @@ -254,7 +210,7 @@ impl Compiler { /// the actual locations of the split in later.) #[inline] fn empty_split(&mut self) -> InstIdx { - self.insts.push(Split(0, 0)); + self.insts.push(Inst::Split(0, 0)); self.insts.len() - 1 } @@ -266,7 +222,7 @@ impl Compiler { fn set_split(&mut self, i: InstIdx, pc1: InstIdx, pc2: InstIdx) { let split = &mut self.insts[i]; match *split { - Split(_, _) => *split = Split(pc1, pc2), + Inst::Split(_, _) => *split = Inst::Split(pc1, pc2), _ => panic!("BUG: Invalid split index."), } } @@ -275,7 +231,7 @@ impl Compiler { /// index of that instruction. #[inline] fn empty_jump(&mut self) -> InstIdx { - self.insts.push(Jump(0)); + self.insts.push(Inst::Jump(0)); self.insts.len() - 1 } @@ -286,7 +242,7 @@ impl Compiler { fn set_jump(&mut self, i: InstIdx, pc: InstIdx) { let jmp = &mut self.insts[i]; match *jmp { - Jump(_) => *jmp = Jump(pc), + Inst::Jump(_) => *jmp = Inst::Jump(pc), _ => panic!("BUG: Invalid jump index."), } } diff --git a/src/input.rs b/src/input.rs new file mode 100644 index 0000000000..446872bb46 --- /dev/null +++ b/src/input.rs @@ -0,0 +1,114 @@ +// Copyright 2014-2015 The Rust Project Developers. 
See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::ops; + +use char::Char; +use prefix::Prefix; + +/// Represents a location in the input. +#[derive(Clone, Copy, Debug)] +pub struct InputAt { + pos: usize, + c: Char, + len: usize, +} + +impl InputAt { + /// Returns true iff this position is at the beginning of the input. + pub fn is_beginning(&self) -> bool { + self.pos == 0 + } + + /// Returns the character at this position. + /// + /// If this position is just before or after the input, then an absent + /// character is returned. + pub fn char(&self) -> Char { + self.c + } + + /// Returns the UTF-8 width of the character at this position. + pub fn len(&self) -> usize { + self.len + } + + /// Returns the byte offset of this position. + pub fn pos(&self) -> usize { + self.pos + } + + /// Returns the byte offset of the next position in the input. + pub fn next_pos(&self) -> usize { + self.pos + self.len + } +} + +/// An abstraction over input used in the matching engines. +pub trait Input { + /// Return an encoding of the position at byte offset `i`. + fn at(&self, i: usize) -> InputAt; + /// Return an encoding of the char position just prior to byte offset `i`. + fn previous_at(&self, i: usize) -> InputAt; + /// Scan the input for a matching prefix. + fn prefix_at(&self, prefixes: &Prefix, at: InputAt) -> Option; +} + +/// An input reader over characters. +/// +/// (This is the only implementation of `Input` at the moment.) +#[derive(Debug)] +pub struct CharInput<'t>(&'t str); + +impl<'t> CharInput<'t> { + /// Return a new character input reader for the given string. 
+ pub fn new(s: &'t str) -> CharInput<'t> { + CharInput(s) + } +} + +impl<'t> ops::Deref for CharInput<'t> { + type Target = str; + + fn deref(&self) -> &str { + self.0 + } +} + +impl<'t> Input for CharInput<'t> { + // This `inline(always)` increases throughput by almost 25% on the `hard` + // benchmarks over a normal `inline` annotation. + // + // I'm not sure why `#[inline]` isn't enough to convince LLVM, but it is + // used *a lot* in the guts of the matching engines. + #[inline(always)] + fn at(&self, i: usize) -> InputAt { + let c = self[i..].chars().next().into(); + InputAt { + pos: i, + c: c, + len: c.len_utf8(), + } + } + + fn previous_at(&self, i: usize) -> InputAt { + let c: Char = self[..i].chars().rev().next().into(); + let len = c.len_utf8(); + InputAt { + pos: i - len, + c: c, + len: len, + } + } + + fn prefix_at(&self, prefixes: &Prefix, at: InputAt) -> Option { + prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) + } +} diff --git a/src/lib.rs b/src/lib.rs index d63c98dad4..9de4c99e84 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,4 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. // @@ -66,8 +66,10 @@ //! compiles*. Said differently, if you only use `regex!` to build regular //! expressions in your program, then your program cannot compile with an //! invalid regular expression. Moreover, the `regex!` macro compiles the -//! given expression to native Rust code, which makes it much faster for -//! searching text. +//! given expression to native Rust code, which ideally makes it faster. +//! Unfortunately (or fortunately), the dynamic implementation has had a lot +//! more optimization work put into it currently, so it is faster than +//! the `regex!` macro in most cases. //! //!
To use the `regex!` macro, you must enable the `plugin` feature and import //! the `regex_macros` crate as a syntax extension: @@ -88,7 +90,7 @@ //! Secondly, the `regex` crate *must* be linked with the name `regex` since //! the generated code depends on finding symbols in the `regex` crate. //! -//! The only downside of using the `regex!` macro is that it can increase the +//! One downside of using the `regex!` macro is that it can increase the //! size of your program's binary since it generates specialized Rust code. //! The extra size probably won't be significant for a small number of //! expressions, but 100+ calls to `regex!` will probably result in a @@ -394,6 +396,8 @@ html_favicon_url = "http://www.rust-lang.org/favicon.ico", html_root_url = "http://doc.rust-lang.org/regex/")] +extern crate aho_corasick; +extern crate memchr; extern crate regex_syntax as syntax; pub use re::{ @@ -403,36 +407,25 @@ pub use re::{ quote, is_match, }; +mod backtrack; +mod char; mod compile; +mod input; +mod pool; +mod prefix; +mod program; +mod nfa; mod re; -mod vm; -/// The `native` module exists to support the `regex!` macro. Do not use. +/// The `internal` module exists to support the `regex!` macro and other +/// suspicious activity, such as testing different matching engines. #[doc(hidden)] -pub mod native { - // Exporting this stuff is bad form, but it's necessary for two reasons. - // Firstly, the `regex!` syntax extension is in a different crate and - // requires access to the representation of a regex (particularly the - // instruction set) in order to compile to native Rust. This could be - // mitigated if `regex!` was defined in the same crate, but this has - // undesirable consequences (such as requiring a dependency on - // `libsyntax`). 
- // - // Secondly, the code generated by `regex!` must *also* be able - // to access various functions in this crate to reduce code duplication - // and to provide a value with precisely the same `Regex` type in this - // crate. This, AFAIK, is impossible to mitigate. - // - // On the bright side, `rustdoc` lets us hide this from the public API - // documentation. - pub use compile::Program; - pub use compile::Inst; - pub use syntax::simple_case_fold; - pub use re::{ExDynamic, ExNative}; - pub use re::Regex::{Dynamic, Native}; - pub use vm::{CharReader, find_prefix}; - pub use vm::MatchKind::{self, Exists, Location, Submatches}; - pub use vm::StepState::{ - self, StepMatchEarlyReturn, StepMatch, StepContinue, +pub mod internal { + pub use char::Char; + pub use input::{Input, CharInput, InputAt}; + pub use program::{ + Program, MatchEngine, CharRanges, Inst, LookInst, OneChar, }; + pub use re::ExNative; + pub use re::Regex::{Dynamic, Native}; } diff --git a/src/nfa.rs b/src/nfa.rs new file mode 100644 index 0000000000..408a150aec --- /dev/null +++ b/src/nfa.rs @@ -0,0 +1,300 @@ +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// FIXME: Currently, the VM simulates an NFA. It would be nice to have another +// VM that simulates a DFA. +// +// According to Russ Cox[1], a DFA performs better than an NFA, principally +// because it reuses states previously computed by the machine *and* doesn't +// keep track of capture groups. The drawback of a DFA (aside from its +// complexity) is that it can't accurately return the locations of submatches. +// The NFA *can* do that. (This is my understanding anyway.) 
+// +// Cox suggests that a DFA ought to be used to answer "does this match" and +// "where does it match" questions. (In the latter, the starting position of +// the match is computed by executing the regex backwards.) Cox also suggests +// that a DFA should be run when asking "where are the submatches", which can +// 1) quickly answer "no" if there's no match and 2) discover the substring +// that matches, which means running the NFA on smaller input. +// +// Currently, the NFA simulation implemented below does some dirty tricks to +// avoid tracking capture groups when they aren't needed (which only works +// for 'is_match', not 'find'). This is a half-measure, but does provide some +// perf improvement. +// +// AFAIK, the DFA/NFA approach is implemented in RE2/C++ but *not* in RE2/Go. +// +// UPDATE: We now have a backtracking matching engine and a DFA for prefix +// matching. The prefix DFA is used in both the NFA simulation below and the +// backtracking engine to skip along the input quickly. +// +// [1] - http://swtch.com/~rsc/regex/regex3.html + +use input::{Input, InputAt, CharInput}; +use program::Program; +use re::CaptureIdxs; + +/// An NFA simulation matching engine. +#[derive(Debug)] +pub struct Nfa<'r, 't> { + prog: &'r Program, + input: CharInput<'t>, +} + +impl<'r, 't> Nfa<'r, 't> { + /// Execute the NFA matching engine. + /// + /// If there's a match, `exec` returns `true` and populates the given + /// captures accordingly.
+ pub fn exec( + prog: &'r Program, + mut caps: &mut CaptureIdxs, + text: &'t str, + start: usize, + ) -> bool { + let mut q = prog.nfa_threads.get(); + let input = CharInput::new(text); + let at = input.at(start); + let matched = Nfa { + prog: prog, + input: input, + }.exec_(&mut q, &mut caps, at); + prog.nfa_threads.put(q); + matched + } + + fn exec_( + &mut self, + mut q: &mut NfaThreads, + mut caps: &mut CaptureIdxs, + mut at: InputAt, + ) -> bool { + let mut matched = false; + q.clist.empty(); q.nlist.empty(); +'LOOP: loop { + if q.clist.size == 0 { + // Three ways to bail out when our current set of threads is + // empty. + // + // 1. We have a match---so we're done exploring any possible + // alternatives. Time to quit. + // + // 2. If the expression starts with a '^' we can terminate as + // soon as the last thread dies. + if matched + || (!at.is_beginning() && self.prog.anchored_begin) { + break; + } + + // 3. If there's a literal prefix for the program, try to + // jump ahead quickly. If it can't be found, then we can + // bail out early. + if !self.prog.prefixes.is_empty() { + at = match self.input.prefix_at(&self.prog.prefixes, at) { + None => break, + Some(at) => at, + }; + } + } + + // This simulates a preceding '.*?' for every regex by adding + // a state starting at the current position in the input for the + // beginning of the program only if we don't already have a match. + if q.clist.size == 0 || (!self.prog.anchored_begin && !matched) { + self.add(&mut q.clist, &mut caps, 0, at) + } + // The previous call to "add" actually inspects the position just + // before the current character. For stepping through the machine, + // we want to look at the current character, so we advance the + // input.
+ let at_next = self.input.at(at.next_pos()); + for i in 0..q.clist.size { + let pc = q.clist.pc(i); + let tcaps = q.clist.caps(i); + if self.step(&mut q.nlist, caps, tcaps, pc, at, at_next) { + matched = true; + if caps.len() == 0 { + // If we only care if a match occurs (not its + // position), then we can quit right now. + break 'LOOP; + } + // We don't need to check the rest of the threads in this + // set because we've matched something ("leftmost-first"). + // However, we still need to check threads in the next set + // to support things like greedy matching. + break; + } + } + if at.char().is_none() { + break; + } + at = at_next; + q.swap(); + q.nlist.empty(); + } + matched + } + + fn step( + &self, + nlist: &mut Threads, + caps: &mut [Option], + thread_caps: &mut [Option], + pc: usize, + at: InputAt, + at_next: InputAt, + ) -> bool { + use program::Inst::*; + match self.prog.insts[pc] { + Match => { + for (slot, val) in caps.iter_mut().zip(thread_caps.iter()) { + *slot = *val; + } + true + } + Char(ref inst) => { + if inst.matches(at.char()) { + self.add(nlist, thread_caps, pc+1, at_next); + } + false + } + Ranges(ref inst) => { + if inst.matches(at.char()).is_some() { + self.add(nlist, thread_caps, pc+1, at_next); + } + false + } + EmptyLook(_) | Save(_) | Jump(_) | Split(_, _) => false, + } + } + + fn add( + &self, + nlist: &mut Threads, + thread_caps: &mut [Option], + pc: usize, + at: InputAt, + ) { + use program::Inst::*; + + if nlist.contains(pc) { + return + } + let ti = nlist.add(pc); + match self.prog.insts[pc] { + EmptyLook(ref inst) => { + let prev = self.input.previous_at(at.pos()); + if inst.matches(prev.char(), at.char()) { + self.add(nlist, thread_caps, pc+1, at); + } + } + Save(slot) => { + if slot >= thread_caps.len() { + self.add(nlist, thread_caps, pc+1, at); + } else { + let old = thread_caps[slot]; + thread_caps[slot] = Some(at.pos()); + self.add(nlist, thread_caps, pc+1, at); + thread_caps[slot] = old; + } + } + Jump(to) => { + 
self.add(nlist, thread_caps, to, at) + } + Split(x, y) => { + self.add(nlist, thread_caps, x, at); + self.add(nlist, thread_caps, y, at); + } + Match | Char(_) | Ranges(_) => { + let mut t = &mut nlist.thread(ti); + for (slot, val) in t.caps.iter_mut().zip(thread_caps.iter()) { + *slot = *val; + } + } + } + } +} + +/// Shared cached state between multiple invocations of a NFA engine +/// in the same thread. +/// +/// It is exported so that it can be cached by `program::Program`. +#[derive(Debug)] +pub struct NfaThreads { + clist: Threads, + nlist: Threads, +} + +#[derive(Debug)] +struct Threads { + dense: Vec, + sparse: Vec, + size: usize, +} + +#[derive(Clone, Debug)] +struct Thread { + pc: usize, + caps: Vec>, +} + +impl NfaThreads { + /// Create new empty state for the NFA engine. + pub fn new(num_insts: usize, ncaps: usize) -> NfaThreads { + NfaThreads { + clist: Threads::new(num_insts, ncaps), + nlist: Threads::new(num_insts, ncaps), + } + } + + fn swap(&mut self) { + ::std::mem::swap(&mut self.clist, &mut self.nlist); + } +} + +impl Threads { + fn new(num_insts: usize, ncaps: usize) -> Threads { + let t = Thread { pc: 0, caps: vec![None; ncaps * 2] }; + Threads { + dense: vec![t; num_insts], + sparse: vec![0; num_insts], + size: 0, + } + } + + fn add(&mut self, pc: usize) -> usize { + let i = self.size; + self.dense[i].pc = pc; + self.sparse[pc] = i; + self.size += 1; + i + } + + fn thread(&mut self, i: usize) -> &mut Thread { + &mut self.dense[i] + } + + fn contains(&self, pc: usize) -> bool { + let s = self.sparse[pc]; + s < self.size && self.dense[s].pc == pc + } + + fn empty(&mut self) { + self.size = 0; + } + + fn pc(&self, i: usize) -> usize { + self.dense[i].pc + } + + fn caps(&mut self, i: usize) -> &mut [Option] { + &mut self.dense[i].caps + } +} diff --git a/src/pool.rs b/src/pool.rs new file mode 100644 index 0000000000..cb29fb0346 --- /dev/null +++ b/src/pool.rs @@ -0,0 +1,93 @@ +// Copyright 2014-2015 The Rust Project Developers. 
See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::cell::RefCell; +use std::fmt; +use std::sync::Mutex; + +/// A very simple memory pool for managing cached state. +/// +/// This was motivated by a singular purpose: reduce the allocation overhead +/// of matching engines. +/// +/// Without a pool, the matching engines need to allocate state each time they +/// are invoked. If a regex is used once to check for a match and never again, +/// then this is OK. But if a regex is used many times over, then not +/// re-allocating the engine's state is a huge win. (A regex is commonly +/// used many times, for example, with `find_iter`, `captures_iter` or +/// `replace_all`.) +/// +/// We use inherited mutability and ensure that each thread gets its own +/// state. There is no limit on the number of states that are created. If a +/// thread requests one and one isn't available, a new one is created. +/// +/// (N.B. It seems like there exists a way to implement this with stronger +/// guarantees, e.g., with a guard of some sort that puts the resource back +/// in the pool when it is dropped. However, the use case for this pool is so +/// simple and localized that it doesn't seem worth it.) +pub struct Pool { + stack: Mutex>>, + create: CreateFn, +} + +/// The type of the function used to create resources if none exist. +pub type CreateFn = Box T + Send + Sync>; + +impl Pool { + /// Create a new pool. + /// + /// When a caller requests a resource from the pool and one does not + /// exist, then `create` is called to allocate a new resource for the + /// caller. + /// + /// It is up to the caller to put the resource back into the pool for + /// future reuse. + /// + /// All resources are created lazily/on-demand.
+ pub fn new(create: CreateFn) -> Pool { + Pool { + stack: Mutex::new(RefCell::new(vec![])), + create: create, + } + } + + /// Request a resource from the pool. + /// + /// If no resources are available, a new one is created. + /// + /// The caller must return the resource to the pool, otherwise the pool + /// will not be able to reuse the resource. + pub fn get(&self) -> T { + let stack = self.stack.lock(); + let stack = stack.unwrap(); + let mut stack = stack.borrow_mut(); + match stack.pop() { + None => (self.create)(), + Some(v) => v, + } + } + + /// Add a resource to the pool. + /// + /// This makes the resource available for use with `get`. + pub fn put(&self, v: T) { + let stack = self.stack.lock(); + let stack = stack.unwrap(); + stack.borrow_mut().push(v); + } +} + +impl fmt::Debug for Pool { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let stack = self.stack.lock(); + let stack = stack.unwrap(); + stack.fmt(f) + } +} diff --git a/src/prefix.rs b/src/prefix.rs new file mode 100644 index 0000000000..d11ddcf5a1 --- /dev/null +++ b/src/prefix.rs @@ -0,0 +1,106 @@ +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use aho_corasick::AcAutomaton; +use memchr::memchr; + +/// A prefix extracted from a compiled regular expression. +/// +/// A regex prefix is a set of literal strings that *must* be matched at the +/// beginning of a regex in order for the entire regex to match. +/// +/// There are a variety of ways to efficiently scan the search text for a +/// prefix. Currently, there are three implemented: +/// +/// 1. The prefix is a single byte. Just use memchr. +/// 2.
If the prefix is a set of two or more single byte prefixes, then +/// a single sparse map is created. Checking if there is a match is a lookup +/// in this map for each byte in the search text. +/// 3. In all other cases, build an Aho-Corasick automaton. +/// +/// It's possible that there's room here for other substring algorithms, +/// such as Boyer-Moore for single-set prefixes greater than 1, or Rabin-Karp +/// for small sets of same-length prefixes. +#[derive(Clone, Debug)] +pub enum Prefix { + /// No prefixes. (Never advances through the input.) + Empty, + /// A single byte prefix. + Single(u8), + /// A set of two or more single byte prefixes. + /// This could be reduced to a bitset, which would use only 8 bytes, + /// but I don't think we care. + Singles(Vec), + /// A full Aho-Corasick DFA automaton. + Automaton(AcAutomaton), +} + +impl Prefix { + /// Create a new prefix matching machine. + pub fn new(pfxs: Vec) -> Prefix { + if pfxs.len() == 0 || pfxs[0].len() == 0 { + Prefix::Empty + } else if pfxs.len() == 1 && pfxs[0].len() == 1 { + Prefix::Single(pfxs[0].as_bytes()[0]) + } else if pfxs.len() >= 2 && pfxs.iter().all(|s| s.len() == 1) { + let mut set = vec![false; 256]; + for p in pfxs { + set[p.as_bytes()[0] as usize] = true; + } + Prefix::Singles(set) + } else { + Prefix::Automaton(AcAutomaton::new(pfxs)) + } + } + + /// Find the position of a prefix in `haystack` if it exists. + /// + /// In the matching engines, we only actually need the starting index + /// because the prefix is used to only skip ahead---the matching engine + /// still needs to run over the prefix input. However, we return the ending + /// location as well in case the prefix corresponds to the entire regex, + /// in which case, you need the end of the match. 
+ pub fn find(&self, haystack: &str) -> Option<(usize, usize)> { + use self::Prefix::*; + match *self { + Empty => Some((0, 0)), + Single(b) => memchr(b, haystack.as_bytes()).map(|i| (i, i+1)), + Singles(ref pats) => find_singles(pats, haystack.as_bytes()), + Automaton(ref aut) => { + aut.find(haystack).next().map(|m| (m.start, m.end)) + } + } + } + + /// Returns true iff this prefix is empty. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns the number of distinct prefixes in this machine. + pub fn len(&self) -> usize { + match *self { + Prefix::Empty => 0, + Prefix::Single(_) => 1, + Prefix::Singles(ref pats) => pats.iter().filter(|&&b| b).count(), + Prefix::Automaton(ref aut) => aut.len(), + } + } +} + +/// A very quick scan for multiple single byte prefixes using a sparse map. +fn find_singles(pats: &[bool], haystack: &[u8]) -> Option<(usize, usize)> { + for (hi, &b) in haystack.iter().enumerate() { + if pats[b as usize] { + return Some((hi, hi+1)); + } + } + None +} diff --git a/src/program.rs b/src/program.rs new file mode 100644 index 0000000000..6986667a92 --- /dev/null +++ b/src/program.rs @@ -0,0 +1,492 @@ +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::cmp::{self, Ordering}; + +use syntax; + +use Error; +use backtrack::{Backtrack, BackMachine}; +use char::Char; +use compile::Compiler; +use nfa::{Nfa, NfaThreads}; +use pool::Pool; +use prefix::Prefix; +use re::CaptureIdxs; + +const NUM_PREFIX_LIMIT: usize = 30; +const PREFIX_LENGTH_LIMIT: usize = 15; + +pub type InstIdx = usize; + +/// An instruction, the underlying unit of a compiled regular expression +#[derive(Clone, Debug)] +pub enum Inst { + /// A match has occurred.
+ /// This is always the last instruction and only occurs in a single spot. + /// We could special case this in the code, but it is much clearer to + /// handle it as a proper instruction. + Match, + /// Save the current location in the input into the given capture location. + Save(usize), + /// Jump to the instruction given. + Jump(InstIdx), + /// Match either instruction, preferring the first. + Split(InstIdx, InstIdx), + /// A zero-width instruction. When this instruction matches, the input + /// is not advanced. + EmptyLook(LookInst), + /// Match a single possibly case insensitive character. + Char(OneChar), + /// Match one or more possibly case insensitive character ranges. + Ranges(CharRanges), +} + +/// A single character instruction. +#[derive(Clone, Debug)] +pub struct OneChar { + /// The character. + pub c: char, + /// True if the character should be matched case insensitively. + /// (i.e., The input character will need to be case folded.) + pub casei: bool, +} + +/// A multi-range character class instruction. +#[derive(Clone, Debug)] +pub struct CharRanges { + /// Sorted sequence of non-overlapping ranges. + pub ranges: Vec<(char, char)>, + /// Whether to match case insensitively. + pub casei: bool, +} + +/// The set of zero-width match instructions. +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum LookInst { + /// Start of line or input. + StartLine, + /// End of line or input. + EndLine, + /// Start of input. + StartText, + /// End of input. + EndText, + /// Word character on one side and non-word character on other. + WordBoundary, + /// Word character on both sides or non-word character on both sides. + NotWordBoundary, +} + +impl OneChar { + /// Tests whether the given input character matches this instruction. 
+ #[inline(always)] // About ~5-15% more throughput than `#[inline]` + pub fn matches(&self, c: Char) -> bool { + self.c == c || (self.casei && self.c == c.case_fold()) + } +} + +impl CharRanges { + /// Emits a range matching any character, `\n` included (`(?s).`). + pub fn any() -> CharRanges { + CharRanges { + ranges: vec![('\x00', '\u{10ffff}')], + casei: false, + } + } + + /// Emits a range matching any character except `\n` (the `.` expression). + pub fn any_nonl() -> CharRanges { + CharRanges { + ranges: vec![('\x00', '\x09'), ('\x0B', '\u{10ffff}')], + casei: false, + } + } + + /// Emits a range from the AST character class. + pub fn from_class(cls: syntax::CharClass) -> CharRanges { + let casei = cls.is_case_insensitive(); + CharRanges { + ranges: cls.into_iter().map(|r| (r.start, r.end)).collect(), + casei: casei, + } + } + + /// Tests whether the given input character matches this instruction. + #[inline(always)] // About ~5-15% more throughput than `#[inline]` + pub fn matches(&self, mut c: Char) -> Option { + if self.casei { + c = c.case_fold(); + } + // This speeds up the `match_class_unicode` benchmark by checking + // some common cases quickly without binary search. e.g., Matching + // a Unicode class on predominantly ASCII text. + for i in 0..cmp::min(self.ranges.len(), 4) { + let r = self.ranges[i]; + if c < r.0 { + return None; + } + if c <= r.1 { + return Some(i); + } + } + self.ranges.binary_search_by(|r| { + if r.1 < c { + Ordering::Less + } else if r.0 > c { + Ordering::Greater + } else { + Ordering::Equal + } + }).ok() + } +} + +impl LookInst { + /// Tests whether the pair of characters matches this zero-width + /// instruction.
+ pub fn matches(&self, c1: Char, c2: Char) -> bool { + use self::LookInst::*; + match *self { + StartLine => c1.is_none() || c1 == '\n', + EndLine => c2.is_none() || c2 == '\n', + StartText => c1.is_none(), + EndText => c2.is_none(), + ref wbty => { + let (w1, w2) = (c1.is_word_char(), c2.is_word_char()); + (*wbty == WordBoundary && w1 ^ w2) + || (*wbty == NotWordBoundary && !(w1 ^ w2)) + } + } + } +} + +/// The matching engines offered by this regex implementation. +/// +/// N.B. This is exported for use in testing. +#[doc(hidden)] +#[derive(Clone, Copy, Debug)] +pub enum MatchEngine { + /// A bounded backtracking implementation. About twice as fast as the + /// NFA, but can only work on small regexes and small input. + Backtrack, + /// A full NFA simulation. Can always be employed but almost always the + /// slowest choice. + Nfa, + /// If the entire regex is a literal and no capture groups have been + /// requested, then we can degrade to a simple substring match. + Literals, +} + +/// Program represents a compiled regular expression. Once an expression is +/// compiled, its representation is immutable and will never change. +/// (Well, almost. In fact, the matching engines cache state that can be +/// reused on subsequent searches. But this is interior mutability that +/// shouldn't be observable by the caller.) +#[derive(Debug)] +pub struct Program { + /// The original regular expression string. + pub original: String, + /// A sequence of instructions. + pub insts: Vec, + /// The sequence of capture group names. There is an entry for each capture + /// group index and a name exists only if the capture group is named. + pub cap_names: Vec>, + /// If the regular expression requires a literal prefix in order to have a + /// match, that prefix is stored here as a DFA. + pub prefixes: Prefix, + /// True iff matching any literal prefix indicates a match. + pub prefixes_complete: bool, + /// True iff program is anchored at the beginning. 
+ pub anchored_begin: bool, + /// True iff program is anchored at the end. + pub anchored_end: bool, + /// The type of matching engine to use. + /// When `None` (the default), pick an engine automatically. + pub engine: Option, + /// Cached NFA threads. + pub nfa_threads: Pool, + /// Cached backtracking memory. + pub backtrack: Pool, +} + +impl Program { + /// Compiles a Regex. + pub fn new( + engine: Option, + size_limit: usize, + re: &str, + ) -> Result { + let expr = try!(syntax::Expr::parse(re)); + let (insts, cap_names) = try!(Compiler::new(size_limit).compile(expr)); + let (insts_len, ncaps) = (insts.len(), num_captures(&insts)); + let create_threads = move || NfaThreads::new(insts_len, ncaps); + let create_backtrack = move || BackMachine::new(); + let mut prog = Program { + original: re.into(), + insts: insts, + cap_names: cap_names, + prefixes: Prefix::Empty, + prefixes_complete: false, + anchored_begin: false, + anchored_end: false, + engine: engine, + nfa_threads: Pool::new(Box::new(create_threads)), + backtrack: Pool::new(Box::new(create_backtrack)), + }; + + prog.find_prefixes(); + prog.anchored_begin = match prog.insts[1] { + Inst::EmptyLook(LookInst::StartText) => true, + _ => false, + }; + prog.anchored_end = match prog.insts[prog.insts.len() - 3] { + Inst::EmptyLook(LookInst::EndText) => true, + _ => false, + }; + Ok(prog) + } + + /// Executes a compiled regex program. 
+ pub fn exec( + &self, + caps: &mut CaptureIdxs, + text: &str, + start: usize, + ) -> bool { + match self.choose_engine(caps.len(), text) { + MatchEngine::Backtrack => Backtrack::exec(self, caps, text, start), + MatchEngine::Nfa => Nfa::exec(self, caps, text, start), + MatchEngine::Literals => { + match self.prefixes.find(&text[start..]) { + None => false, + Some((s, e)) => { + if caps.len() == 2 { + caps[0] = Some(start + s); + caps[1] = Some(start + e); + } + true + } + } + } + } + } + + fn choose_engine(&self, cap_len: usize, text: &str) -> MatchEngine { + // If the engine is already chosen, then we use it. + // But that might not be a good idea. e.g., What if `Literals` is + // chosen and it can't work? I guess we should probably check whether + // the chosen engine is appropriate or not. + self.engine.unwrap_or_else(|| { + if cap_len <= 2 + && self.prefixes.len() == 1 + && self.prefixes_complete { + // We can only use this when the regex is entirely a literal + // (not an alternation of literals). + // The reason (for now) is that the prefix DFA doesn't handle + // priority the same way the regex engine does. + // e.g., given `ab|a`, the prefix DFA would report `a` as a + // match in the string `ab`, when in fact, `ab` should match. + // + // But, we can still get major winnings by avoiding the + // matching engine for a single literal. + // + // I guess we could teach Aho-Corasick about priority, but we + // might as well just implement a full DFA. + MatchEngine::Literals + } else if Backtrack::should_exec(self, text) { + // We're only here if the input and regex combined are small. + MatchEngine::Backtrack + } else { + MatchEngine::Nfa + } + }) + } + + /// Returns the total number of capture groups in the regular expression. + /// This includes the zeroth capture. + pub fn num_captures(&self) -> usize { + num_captures(&self.insts) + } + + /// Allocate new capture groups. 
+ pub fn alloc_captures(&self) -> Vec> { + vec![None; 2 * self.num_captures()] + } + + /// Find and store a prefix machine for the current program. + pub fn find_prefixes(&mut self) { + use self::Inst::*; + + let (ps, complete) = self.prefixes_from_insts(1); + if ps.len() > 0 { + self.prefixes = Prefix::new(ps); + self.prefixes_complete = complete; + return; + } + let mut pc = 1; + let mut prefixes = vec![]; + let mut pcomplete = true; + while let Split(x, y) = self.insts[pc] { + let (xps, xcomplete) = self.prefixes_from_insts(x); + let (yps, ycomplete) = self.prefixes_from_insts(y); + let mut done = false; + match (&self.insts[x], &self.insts[y]) { + // We should be able to support this. Add explicit stack. ---AG + (&Split(_, _), &Split(_, _)) => return, + (_, &Split(_, _)) if xps.len() == 0 => return, + (_, &Split(_, _)) => { + pcomplete = pcomplete && xcomplete; + prefixes.extend(xps); + pc = y; + } + (&Split(_, _), _) if yps.len() == 0 => return, + (&Split(_, _), _) => { + pcomplete = pcomplete && ycomplete; + prefixes.extend(yps); + pc = x; + } + _ if xps.len() == 0 || yps.len() == 0 => return, + // This is our base case. We've followed splits the whole + // way, which means both instructions lead to a match. + _ => { + pcomplete = pcomplete && xcomplete && ycomplete; + prefixes.extend(xps); + prefixes.extend(yps); + done = true; + } + } + // Arg. We've over-extended ourselves, quit with nothing to + // show for it. + if prefixes.len() > NUM_PREFIX_LIMIT { + return; + } + if done { break; } + } + self.prefixes_complete = pcomplete; + self.prefixes = Prefix::new(prefixes); + } + + /// Find a prefix starting at the given instruction. + /// + /// Returns `true` in the tuple if the end of the prefix leads trivially + /// to a match. (This may report false negatives, but being conservative + /// is OK.) 
+ fn prefixes_from_insts(&self, mut pc: usize) -> (Vec, bool) { + use self::Inst::*; + + let mut complete = true; + let mut alts = vec![String::new()]; + while pc < self.insts.len() { + let inst = &self.insts[pc]; + + // Each iteration adds one character to every alternate prefix *or* + // it stops. Thus, the prefix alternates grow in lock step, and it + // suffices to check one of them to see if the prefix limit has been + // exceeded. + if alts[0].len() > PREFIX_LENGTH_LIMIT { + complete = false; + break; + } + match *inst { + Save(_) => { pc += 1; continue } // completely ignore it + Char(OneChar { c, casei: false }) => { + for alt in &mut alts { + alt.push(c); + } + pc += 1; + } + Ranges(CharRanges { ref ranges, casei: false }) => { + let nchars = num_chars_in_ranges(ranges); + if alts.len() * nchars > NUM_PREFIX_LIMIT { + complete = false; + break; + } + + let orig = alts; + alts = Vec::with_capacity(orig.len()); + for &(s, e) in ranges { + for c in (s as u32)..(e as u32 + 1){ + for alt in &orig { + let mut alt = alt.clone(); + alt.push(::std::char::from_u32(c).unwrap()); + alts.push(alt); + } + } + } + pc += 1; + } + Jump(pc2) => pc = pc2, + _ => { complete = self.leads_to_match(pc); break } + } + } + if alts[0].len() == 0 { + (vec![], false) + } else { + (alts, complete) + } + } + + fn leads_to_match(&self, mut pc: usize) -> bool { + // I'm pretty sure this is conservative, so it might have some + // false negatives. 
+ loop { + match self.insts[pc] { + Inst::Match => return true, + Inst::Save(_) => pc += 1, + Inst::Jump(pc2) => pc = pc2, + _ => return false, + } + } + } +} + +impl Clone for Program { + fn clone(&self) -> Program { + let (insts_len, ncaps) = (self.insts.len(), self.num_captures()); + let create_threads = move || NfaThreads::new(insts_len, ncaps); + let create_backtrack = move || BackMachine::new(); + Program { + original: self.original.clone(), + insts: self.insts.clone(), + cap_names: self.cap_names.clone(), + prefixes: self.prefixes.clone(), + prefixes_complete: self.prefixes_complete, + anchored_begin: self.anchored_begin, + anchored_end: self.anchored_end, + engine: self.engine, + nfa_threads: Pool::new(Box::new(create_threads)), + backtrack: Pool::new(Box::new(create_backtrack)), + } + } +} + +/// Return the number of captures in the given sequence of instructions. +fn num_captures(insts: &[Inst]) -> usize { + let mut n = 0; + for inst in insts { + match *inst { + Inst::Save(c) => n = cmp::max(n, c+1), + _ => {} + } + } + // There's exactly 2 Save slots for every capture. + n / 2 +} + +/// Count the characters covered by the given ranges (both ends inclusive). +/// +/// This is useful for pre-emptively limiting the number of prefix literals +/// we extract from a regex program. +fn num_chars_in_ranges(ranges: &[(char, char)]) -> usize { + ranges.iter() + .map(|&(s, e)| 1 + (e as u32) - (s as u32)) + .fold(0, |acc, len| acc + len) as usize +} diff --git a/src/re.rs b/src/re.rs index f3eb7b19b1..556ad8f83d 100644 --- a/src/re.rs +++ b/src/re.rs @@ -1,4 +1,4 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT.
// @@ -16,14 +16,21 @@ use std::fmt; use std::str::pattern::{Pattern, Searcher, SearchStep}; use std::str::FromStr; -use compile::Program; +use program::{Program, MatchEngine}; use syntax; -use vm; -use vm::CaptureLocs; -use vm::MatchKind::{self, Exists, Location, Submatches}; -use self::NamesIter::*; -use self::Regex::*; +const REPLACE_EXPAND: &'static str = r"(?x) + (?P^|\b|[^$]) # Ignore `$$name`. + \$ + (?P # Match the actual capture name. Can be... + [0-9]+ # A sequence of digits (for indexed captures), or... + | + [_a-zA-Z][_0-9a-zA-Z]* # A name for named captures. + ) +"; + +/// Type alias for representing capture indices. +pub type CaptureIdxs = [Option]; /// Escapes all regular expression meta characters in `text`. /// @@ -166,20 +173,11 @@ pub enum Regex { // See the comments for the `program` module in `lib.rs` for a more // detailed explanation for what `regex!` requires. #[doc(hidden)] - Dynamic(ExDynamic), + Dynamic(Program), #[doc(hidden)] Native(ExNative), } -#[derive(Clone)] -#[doc(hidden)] -pub struct ExDynamic { - original: String, - names: Vec>, - #[doc(hidden)] - pub prog: Program -} - #[doc(hidden)] pub struct ExNative { #[doc(hidden)] @@ -187,7 +185,7 @@ pub struct ExNative { #[doc(hidden)] pub names: &'static &'static [Option<&'static str>], #[doc(hidden)] - pub prog: fn(MatchKind, &str, usize, usize) -> Vec> + pub prog: fn(&mut CaptureIdxs, &str, usize) -> bool, } impl Copy for ExNative {} @@ -250,13 +248,29 @@ impl Regex { /// /// The default size limit used in `new` is 10MB. pub fn with_size_limit(size: usize, re: &str) -> Result { - let ast = try!(syntax::Expr::parse(re)); - let (prog, names) = try!(Program::new(ast, size)); - Ok(Dynamic(ExDynamic { - original: re.to_string(), - names: names, - prog: prog, - })) + Regex::with_engine(None, size, re) + } + + /// Compiles a dynamic regular expression and uses given matching engine. + /// + /// This is exposed for use in testing and shouldn't be used by clients. 
+ /// Instead, the regex program should choose the correct matching engine + /// to use automatically. (Based on the regex, the size of the input and + /// the type of search.) + /// + /// A value of `None` means that the engine is automatically selected, + /// which is the default behavior. + /// + /// **WARNING**: Passing an unsuitable engine for the given regex/input + /// could lead to bad things. (Not unsafe things, but panics, incorrect + /// matches and large memory use are all things that could happen.) + #[doc(hidden)] + pub fn with_engine( + engine: Option, + size: usize, + re: &str, + ) -> Result { + Program::new(engine, size, re).map(Regex::Dynamic) } @@ -271,12 +285,11 @@ impl Regex { /// # extern crate regex; use regex::Regex; /// # fn main() { /// let text = "I categorically deny having triskaidekaphobia."; - /// let matched = Regex::new(r"\b\w{13}\b").unwrap().is_match(text); - /// assert!(matched); + /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text)); /// # } /// ``` pub fn is_match(&self, text: &str) -> bool { - has_match(&exec(self, Exists, text)) + exec(self, &mut [], text, 0) } /// Returns the start and end byte range of the leftmost-first match in @@ -300,8 +313,8 @@ impl Regex { /// # } /// ``` pub fn find(&self, text: &str) -> Option<(usize, usize)> { - let caps = exec(self, Location, text); - if has_match(&caps) { + let mut caps = [None, None]; + if exec(self, &mut caps, text, 0) { Some((caps[0].unwrap(), caps[1].unwrap())) } else { None @@ -392,8 +405,12 @@ impl Regex { /// The `0`th capture group is always unnamed, so it must always be /// accessed with `at(0)`. 
pub fn captures<'t>(&self, text: &'t str) -> Option> { - let caps = exec(self, Submatches, text); - Captures::new(self, text, caps) + let mut caps = self.alloc_captures(); + if exec(self, &mut caps, text, 0) { + Some(Captures::new(self, text, caps)) + } else { + None + } } /// Returns an iterator over all the non-overlapping capture groups matched @@ -579,17 +596,29 @@ impl Regex { let mut new = String::with_capacity(text.len()); let mut last_match = 0; - for (i, cap) in self.captures_iter(text).enumerate() { - // It'd be nicer to use the 'take' iterator instead, but it seemed - // awkward given that '0' => no limit. - if limit > 0 && i >= limit { - break + if rep.no_expand().is_some() { + // borrow checker pains. `rep` is borrowed mutably in the `else` + // branch below. + let rep = rep.no_expand().unwrap(); + for (i, (s, e)) in self.find_iter(text).enumerate() { + if limit > 0 && i >= limit { + break + } + new.push_str(&text[last_match..s]); + new.push_str(&rep); + last_match = e; + } + } else { + for (i, cap) in self.captures_iter(text).enumerate() { + if limit > 0 && i >= limit { + break + } + // unwrap on 0 is OK because captures only reports matches + let (s, e) = cap.pos(0).unwrap(); + new.push_str(&text[last_match..s]); + new.push_str(&rep.reg_replace(&cap)); + last_match = e; } - - let (s, e) = cap.pos(0).unwrap(); // captures only reports matches - new.push_str(&text[last_match..s]); - new.push_str(&rep.reg_replace(&cap)); - last_match = e; } new.push_str(&text[last_match..]); return new; @@ -598,31 +627,37 @@ impl Regex { /// Returns the original string of this regex. pub fn as_str<'a>(&'a self) -> &'a str { match *self { - Dynamic(ExDynamic { ref original, .. }) => original, - Native(ExNative { ref original, .. }) => original, + Regex::Dynamic(Program { ref original, .. }) => original, + Regex::Native(ExNative { ref original, .. 
}) => original, } } #[doc(hidden)] pub fn names_iter<'a>(&'a self) -> NamesIter<'a> { match *self { - Native(ref n) => NamesIterNative(n.names.iter()), - Dynamic(ref d) => NamesIterDynamic(d.names.iter()) + Regex::Native(ref n) => NamesIter::Native(n.names.iter()), + Regex::Dynamic(ref d) => NamesIter::Dynamic(d.cap_names.iter()) } } fn names_len(&self) -> usize { match *self { - Native(ref n) => n.names.len(), - Dynamic(ref d) => d.names.len() + Regex::Native(ref n) => n.names.len(), + Regex::Dynamic(ref d) => d.cap_names.len() } } + fn alloc_captures(&self) -> Vec> { + match *self { + Regex::Native(ref n) => vec![None; 2 * n.names.len()], + Regex::Dynamic(ref d) => d.alloc_captures(), + } + } } pub enum NamesIter<'a> { - NamesIterNative(::std::slice::Iter<'a, Option<&'static str>>), - NamesIterDynamic(::std::slice::Iter<'a, Option>) + Native(::std::slice::Iter<'a, Option<&'static str>>), + Dynamic(::std::slice::Iter<'a, Option>) } impl<'a> Iterator for NamesIter<'a> { @@ -630,8 +665,10 @@ impl<'a> Iterator for NamesIter<'a> { fn next(&mut self) -> Option> { match *self { - NamesIterNative(ref mut i) => i.next().map(|x| x.map(|s| s.to_string())), - NamesIterDynamic(ref mut i) => i.next().map(|x| x.as_ref().map(|s| s.to_string())), + NamesIter::Native(ref mut i) => + i.next().map(|x| x.map(|s| s.to_owned())), + NamesIter::Dynamic(ref mut i) => + i.next().map(|x| x.as_ref().map(|s| s.to_owned())), } } } @@ -653,24 +690,39 @@ pub trait Replacer { /// The `'a` lifetime refers to the lifetime of a borrowed string when /// a new owned string isn't needed (e.g., for `NoExpand`). fn reg_replace<'a>(&'a mut self, caps: &Captures) -> Cow<'a, str>; + + /// Returns a possibly owned string that never needs expansion. 
+ fn no_expand<'a>(&'a mut self) -> Option> { None } } impl<'t> Replacer for NoExpand<'t> { fn reg_replace<'a>(&'a mut self, _: &Captures) -> Cow<'a, str> { - let NoExpand(s) = *self; - Cow::Borrowed(s) + self.0.into() + } + + fn no_expand<'a>(&'a mut self) -> Option> { + Some(self.0.into()) } } impl<'t> Replacer for &'t str { fn reg_replace<'a>(&'a mut self, caps: &Captures) -> Cow<'a, str> { - Cow::Owned(caps.expand(*self)) + caps.expand(*self).into() + } + + fn no_expand<'a>(&'a mut self) -> Option> { + let re = Regex::new(REPLACE_EXPAND).unwrap(); + if !re.is_match(self) { + Some((*self).into()) + } else { + None + } } } impl Replacer for F where F: FnMut(&Captures) -> String { fn reg_replace<'a>(&'a mut self, caps: &Captures) -> Cow<'a, str> { - Cow::Owned((*self)(caps)) + (*self)(caps).into() } } @@ -750,37 +802,33 @@ impl<'r, 't> Iterator for RegexSplitsN<'r, 't> { /// `'t` is the lifetime of the matched text. pub struct Captures<'t> { text: &'t str, - locs: CaptureLocs, + locs: Vec>, named: Option>, } impl<'t> Captures<'t> { - fn new(re: &Regex, search: &'t str, locs: CaptureLocs) - -> Option> { - if !has_match(&locs) { - return None - } - + fn new( + re: &Regex, + search: &'t str, + locs: Vec>, + ) -> Captures<'t> { let named = if re.names_len() == 0 { None } else { let mut named = HashMap::new(); for (i, name) in re.names_iter().enumerate() { - match name { - None => {}, - Some(name) => { - named.insert(name, i); - } + if let Some(name) = name { + named.insert(name, i); } } Some(named) }; - Some(Captures { + Captures { text: search, locs: locs, named: named, - }) + } } /// Returns the start and end positions of the Nth capture group. @@ -856,15 +904,7 @@ impl<'t> Captures<'t> { /// To write a literal `$` use `$$`. pub fn expand(&self, text: &str) -> String { // How evil can you get? - let re = Regex::new(r"(?x) - (?P^|\b|[^$]) # Ignore `$$name`. - \$ - (?P # Match the actual capture name. Can be... 
- [0-9]+ # A sequence of digits (for indexed captures), or... - | - [_a-zA-Z][_0-9a-zA-Z]* # A name for named captures. - ) - ").unwrap(); + let re = Regex::new(REPLACE_EXPAND).unwrap(); let text = re.replace_all(text, |refs: &Captures| -> String { let before = refs.name("before").unwrap_or(""); let name = refs.name("name").unwrap_or(""); @@ -974,14 +1014,11 @@ impl<'r, 't> Iterator for FindCaptures<'r, 't> { return None } - let caps = exec_slice(self.re, Submatches, self.search, - self.last_end, self.search.len()); - let (s, e) = - if !has_match(&caps) { - return None - } else { - (caps[0].unwrap(), caps[1].unwrap()) - }; + let mut caps = self.re.alloc_captures(); + if !exec(self.re, &mut caps, self.search, self.last_end) { + return None + } + let (s, e) = (caps[0].unwrap(), caps[1].unwrap()); // Don't accept empty matches immediately following a match. // i.e., no infinite loops please. @@ -995,7 +1032,7 @@ impl<'r, 't> Iterator for FindCaptures<'r, 't> { } self.last_end = e; self.last_match = Some(self.last_end); - Captures::new(self.re, self.search, caps) + Some(Captures::new(self.re, self.search, caps)) } } @@ -1022,14 +1059,11 @@ impl<'r, 't> Iterator for FindMatches<'r, 't> { return None } - let caps = exec_slice(self.re, Location, self.search, - self.last_end, self.search.len()); - let (s, e) = - if !has_match(&caps) { - return None - } else { - (caps[0].unwrap(), caps[1].unwrap()) - }; + let mut caps = [None, None]; + if !exec(self.re, &mut caps, self.search, self.last_end) { + return None; + } + let (s, e) = (caps[0].unwrap(), caps[1].unwrap()); // Don't accept empty matches immediately following a match. // i.e., no infinite loops please. 
@@ -1106,19 +1140,9 @@ unsafe impl<'r, 't> Searcher<'t> for RegexSearcher<'r, 't> { } } -fn exec(re: &Regex, which: MatchKind, input: &str) -> CaptureLocs { - exec_slice(re, which, input, 0, input.len()) -} - -fn exec_slice(re: &Regex, which: MatchKind, - input: &str, s: usize, e: usize) -> CaptureLocs { +fn exec(re: &Regex, caps: &mut CaptureIdxs, text: &str, start: usize) -> bool { match *re { - Dynamic(ExDynamic { ref prog, .. }) => vm::run(which, prog, input, s, e), - Native(ExNative { ref prog, .. }) => (*prog)(which, input, s, e), + Regex::Native(ExNative { ref prog, .. }) => (*prog)(caps, text, start), + Regex::Dynamic(ref prog) => prog.exec(caps, text, start), } } - -#[inline] -fn has_match(caps: &CaptureLocs) -> bool { - caps.len() >= 2 && caps[0].is_some() && caps[1].is_some() -} diff --git a/src/vm.rs b/src/vm.rs deleted file mode 100644 index 7fcd7fded8..0000000000 --- a/src/vm.rs +++ /dev/null @@ -1,531 +0,0 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -// FIXME: Currently, the VM simulates an NFA. It would be nice to have another -// VM that simulates a DFA. -// -// According to Russ Cox[1], a DFA performs better than an NFA, principally -// because it reuses states previously computed by the machine *and* doesn't -// keep track of capture groups. The drawback of a DFA (aside from its -// complexity) is that it can't accurately return the locations of submatches. -// The NFA *can* do that. (This is my understanding anyway.) -// -// Cox suggests that a DFA ought to be used to answer "does this match" and -// "where does it match" questions. (In the latter, the starting position of -// the match is computed by executing the regex backwards.) 
Cox also suggests -// that a DFA should be run when asking "where are the submatches", which can -// 1) quickly answer "no" is there's no match and 2) discover the substring -// that matches, which means running the NFA on smaller input. -// -// Currently, the NFA simulation implemented below does some dirty tricks to -// avoid tracking capture groups when they aren't needed (which only works -// for 'is_match', not 'find'). This is a half-measure, but does provide some -// perf improvement. -// -// AFAIK, the DFA/NFA approach is implemented in RE2/C++ but *not* in RE2/Go. -// -// [1] - http://swtch.com/~rsc/regex/regex3.html - -use self::MatchKind::*; -use self::StepState::*; - -use std::cmp; -use std::mem; - -use compile::Program; -use compile::Inst::*; -use syntax; - -pub type CaptureLocs = Vec>; - -/// Indicates the type of match to be performed by the VM. -#[derive(Copy, Clone)] -pub enum MatchKind { - /// Only checks if a match exists or not. Does not return location. - Exists, - /// Returns the start and end indices of the entire match in the input - /// given. - Location, - /// Returns the start and end indices of each submatch in the input given. - Submatches, -} - -/// Runs an NFA simulation on the compiled expression given on the search text -/// `input`. The search begins at byte index `start` and ends at byte index -/// `end`. (The range is specified here so that zero-width assertions will work -/// correctly when searching for successive non-overlapping matches.) -/// -/// The `which` parameter indicates what kind of capture information the caller -/// wants. There are three choices: match existence only, the location of the -/// entire match or the locations of the entire match in addition to the -/// locations of each submatch. 
-pub fn run<'r, 't>(which: MatchKind, prog: &'r Program, input: &'t str, - start: usize, end: usize) -> CaptureLocs { - Nfa { - which: which, - prog: prog, - input: input, - start: start, - end: end, - ic: 0, - chars: CharReader::new(input), - }.run() -} - -struct Nfa<'r, 't> { - which: MatchKind, - prog: &'r Program, - input: &'t str, - start: usize, - end: usize, - ic: usize, - chars: CharReader<'t>, -} - -/// Indicates the next action to take after a single non-empty instruction -/// is processed. -#[derive(Copy, Clone)] -pub enum StepState { - /// This is returned if and only if a Match instruction is reached and - /// we only care about the existence of a match. It instructs the VM to - /// quit early. - StepMatchEarlyReturn, - /// Indicates that a match was found. Thus, the rest of the states in the - /// *current* queue should be dropped (i.e., leftmost-first semantics). - /// States in the "next" queue can still be processed. - StepMatch, - /// No match was found. Continue with the next state in the queue. - StepContinue, -} - -impl<'r, 't> Nfa<'r, 't> { - fn run(&mut self) -> CaptureLocs { - let ncaps = match self.which { - Exists => 0, - Location => 1, - Submatches => self.prog.num_captures(), - }; - let mut matched = false; - let ninsts = self.prog.insts.len(); - let mut clist = Threads::new(self.which, ninsts, ncaps); - let mut nlist = Threads::new(self.which, ninsts, ncaps); - let mut groups = vec![None; ncaps * 2]; - - // Determine if the expression starts with a '^' so we can avoid - // simulating .*? - // Make sure multi-line mode isn't enabled for it, otherwise we can't - // drop the initial .*? - let prefix_anchor = match self.prog.insts[1] { - StartText => true, - _ => false, - }; - - self.ic = self.start; - let mut next_ic = self.chars.set(self.start); - while self.ic <= self.end { - if clist.size == 0 { - // We have a match and we're done exploring alternatives. - // Time to quit. 
- if matched { - break - } - - // If the expression starts with a '^' we can terminate as soon - // as the last thread dies. - if self.ic != 0 && prefix_anchor { - break; - } - - // If there are no threads to try, then we'll have to start - // over at the beginning of the regex. - // BUT, if there's a literal prefix for the program, try to - // jump ahead quickly. If it can't be found, then we can bail - // out early. - if self.prog.prefix.len() > 0 { - let needle = self.prog.prefix.as_bytes(); - let haystack = &self.input.as_bytes()[self.ic..]; - match find_prefix(needle, haystack) { - None => break, - Some(i) => { - self.ic += i; - next_ic = self.chars.set(self.ic); - } - } - } - } - - // This simulates a preceding '.*?' for every regex by adding - // a state starting at the current position in the input for the - // beginning of the program only if we don't already have a match. - if clist.size == 0 || (!prefix_anchor && !matched) { - self.add(&mut clist, 0, &mut groups) - } - - // Now we try to read the next character. - // As a result, the 'step' method will look at the previous - // character. 
- self.ic = next_ic; - next_ic = self.chars.advance(); - - for i in 0..clist.size { - let pc = clist.pc(i); - let step_state = self.step(&mut groups, &mut nlist, - clist.groups(i), pc); - match step_state { - StepMatchEarlyReturn => return vec![Some(0), Some(0)], - StepMatch => { matched = true; break }, - StepContinue => {}, - } - } - mem::swap(&mut clist, &mut nlist); - nlist.empty(); - } - match self.which { - Exists if matched => vec![Some(0), Some(0)], - Exists => vec![None, None], - Location | Submatches => groups, - } - } - - fn step(&self, groups: &mut [Option], nlist: &mut Threads, - caps: &mut [Option], pc: usize) - -> StepState { - match self.prog.insts[pc] { - Match => { - match self.which { - Exists => { - return StepMatchEarlyReturn - } - Location => { - groups[0] = caps[0]; - groups[1] = caps[1]; - return StepMatch - } - Submatches => { - for (slot, val) in groups.iter_mut().zip(caps.iter()) { - *slot = *val; - } - return StepMatch - } - } - } - OneChar { c, casei } => { - if self.char_eq(casei, self.chars.prev, c) { - self.add(nlist, pc+1, caps); - } - } - CharClass(ref cls) => { - if self.chars.prev.map(|c| cls.matches(c)).unwrap_or(false) { - self.add(nlist, pc+1, caps); - } - } - Any => self.add(nlist, pc+1, caps), - AnyNoNL => { - if !self.char_eq(false, self.chars.prev, '\n') { - self.add(nlist, pc+1, caps) - } - } - StartLine | EndLine | StartText | EndText - | WordBoundary | NotWordBoundary - | Save(_) | Jump(_) | Split(_, _) => {}, - } - StepContinue - } - - fn add(&self, nlist: &mut Threads, pc: usize, groups: &mut [Option]) { - if nlist.contains(pc) { - return - } - // We have to add states to the threads list even if their empty. - // TL;DR - It prevents cycles. - // If we didn't care about cycles, we'd *only* add threads that - // correspond to non-jumping instructions (OneChar, Any, Match, etc.). - // But, it's possible for valid regexs (like '(a*)*') to result in - // a cycle in the instruction list. 
e.g., We'll keep chasing the Split - // instructions forever. - // So we add these instructions to our thread queue, but in the main - // VM loop, we look for them but simply ignore them. - // Adding them to the queue prevents them from being revisited so we - // can avoid cycles (and the inevitable stack overflow). - // - // We make a minor optimization by indicating that the state is "empty" - // so that its capture groups are not filled in. - match self.prog.insts[pc] { - StartLine => { - nlist.add(pc, groups, true); - if self.chars.is_begin() || self.char_is(self.chars.prev, '\n') { - self.add(nlist, pc + 1, groups); - } - } - StartText => { - nlist.add(pc, groups, true); - if self.chars.is_begin() { - self.add(nlist, pc + 1, groups); - } - } - EndLine => { - nlist.add(pc, groups, true); - if self.chars.is_end() || self.char_is(self.chars.cur, '\n') { - self.add(nlist, pc + 1, groups) - } - } - EndText => { - nlist.add(pc, groups, true); - if self.chars.is_end() { - self.add(nlist, pc + 1, groups) - } - } - WordBoundary => { - nlist.add(pc, groups, true); - if self.chars.is_word_boundary() { - self.add(nlist, pc + 1, groups); - } - } - NotWordBoundary => { - nlist.add(pc, groups, true); - if !self.chars.is_word_boundary() { - self.add(nlist, pc + 1, groups); - } - } - Save(slot) => { - nlist.add(pc, groups, true); - match self.which { - Location if slot <= 1 => { - let old = groups[slot]; - groups[slot] = Some(self.ic); - self.add(nlist, pc + 1, groups); - groups[slot] = old; - } - Submatches => { - let old = groups[slot]; - groups[slot] = Some(self.ic); - self.add(nlist, pc + 1, groups); - groups[slot] = old; - } - Exists | Location => self.add(nlist, pc + 1, groups), - } - } - Jump(to) => { - nlist.add(pc, groups, true); - self.add(nlist, to, groups) - } - Split(x, y) => { - nlist.add(pc, groups, true); - self.add(nlist, x, groups); - self.add(nlist, y, groups); - } - Match | OneChar{..} | CharClass(_) | Any | AnyNoNL => { - nlist.add(pc, groups, false); - } 
- } - } - - // Use Unicode simple case folding for case insensitive comparisons, - // as we’re matching individual code points. - #[inline] - fn char_eq(&self, casei: bool, textc: Option, regc: char) -> bool { - match textc { - None => false, - Some(textc) => { - regc == textc || (casei && syntax::simple_case_fold(regc) == syntax::simple_case_fold(textc)) - } - } - } - - #[inline] - fn char_is(&self, textc: Option, regc: char) -> bool { - textc == Some(regc) - } -} - -/// CharReader is responsible for maintaining a "previous" and a "current" -/// character. This one-character lookahead is necessary for assertions that -/// look one character before or after the current position. -pub struct CharReader<'t> { - /// The previous character read. It is None only when processing the first - /// character of the input. - pub prev: Option, - /// The current character. - pub cur: Option, - input: &'t str, - next: usize, -} - -impl<'t> CharReader<'t> { - /// Returns a new CharReader that advances through the input given. - /// Note that a CharReader has no knowledge of the range in which to search - /// the input. - pub fn new(input: &'t str) -> CharReader<'t> { - CharReader { - prev: None, - cur: None, - input: input, - next: 0, - } - } - - /// Sets the previous and current character given any arbitrary byte - /// index (at a Unicode codepoint boundary). - #[inline] - pub fn set(&mut self, ic: usize) -> usize { - self.prev = None; - self.cur = None; - self.next = 0; - - if self.input.len() == 0 { - return 1 - } - if ic > 0 { - let i = cmp::min(ic, self.input.len()); - self.prev = self.input[..i].chars().rev().next(); - } - if ic < self.input.len() { - let cur = self.input[ic..].chars().next().unwrap(); - self.cur = Some(cur); - self.next = ic + cur.len_utf8(); - self.next - } else { - self.input.len() + 1 - } - } - - /// Does the same as `set`, except it always advances to the next - /// character in the input (and therefore does half as many UTF8 decodings). 
- #[inline] - pub fn advance(&mut self) -> usize { - self.prev = self.cur; - if self.next < self.input.len() { - let cur = self.input[self.next..].chars().next().unwrap(); - self.cur = Some(cur); - self.next += cur.len_utf8(); - } else { - self.cur = None; - self.next = self.input.len() + 1; - } - self.next - } - - /// Returns true if and only if this is the beginning of the input - /// (ignoring the range of the input to search). - #[inline] - pub fn is_begin(&self) -> bool { self.prev.is_none() } - - /// Returns true if and only if this is the end of the input - /// (ignoring the range of the input to search). - #[inline] - pub fn is_end(&self) -> bool { self.cur.is_none() } - - /// Returns true if and only if the current position is a word boundary. - /// (Ignoring the range of the input to search.) - pub fn is_word_boundary(&self) -> bool { - fn is_word(c: Option) -> bool { - c.map(syntax::is_word_char).unwrap_or(false) - } - - if self.is_begin() { - return is_word(self.cur); - } - if self.is_end() { - return is_word(self.prev); - } - (is_word(self.cur) && !is_word(self.prev)) - || (is_word(self.prev) && !is_word(self.cur)) - } -} - -#[derive(Clone)] -struct Thread { - pc: usize, - groups: Vec>, -} - -struct Threads { - which: MatchKind, - queue: Vec, - sparse: Vec, - size: usize, -} - -impl Threads { - // This is using a wicked neat trick to provide constant time lookup - // for threads in the queue using a sparse set. A queue of threads is - // allocated once with maximal size when the VM initializes and is reused - // throughout execution. That is, there should be zero allocation during - // the execution of a VM. - // - // See http://research.swtch.com/sparse for the deets. 
- fn new(which: MatchKind, num_insts: usize, ncaps: usize) -> Threads { - let t = Thread { pc: 0, groups: vec![None; ncaps * 2] }; - Threads { - which: which, - queue: vec![t; num_insts], - sparse: vec![0; num_insts], - size: 0, - } - } - - fn add(&mut self, pc: usize, groups: &[Option], empty: bool) { - let t = &mut self.queue[self.size]; - t.pc = pc; - match (empty, self.which) { - (_, Exists) | (true, _) => {}, - (false, Location) => { - t.groups[0] = groups[0]; - t.groups[1] = groups[1]; - } - (false, Submatches) => { - for (slot, val) in t.groups.iter_mut().zip(groups.iter()) { - *slot = *val; - } - } - } - self.sparse[pc] = self.size; - self.size += 1; - } - - #[inline] - fn contains(&self, pc: usize) -> bool { - let s = self.sparse[pc]; - s < self.size && self.queue[s].pc == pc - } - - #[inline] - fn empty(&mut self) { - self.size = 0; - } - - #[inline] - fn pc(&self, i: usize) -> usize { - self.queue[i].pc - } - - #[inline] - fn groups(&mut self, i: usize) -> &mut [Option] { - &mut self.queue[i].groups - } -} - -/// Returns the starting location of `needle` in `haystack`. -/// If `needle` is not in `haystack`, then `None` is returned. -/// -/// Note that this is using a naive substring algorithm. -#[inline] -pub fn find_prefix(needle: &[u8], haystack: &[u8]) -> Option { - let (hlen, nlen) = (haystack.len(), needle.len()); - if nlen > hlen || nlen == 0 { - return None - } - for (offset, window) in haystack.windows(nlen).enumerate() { - if window == needle { - return Some(offset) - } - } - None -}