diff --git a/native/Cargo.lock b/native/Cargo.lock index 181ec5b0e..06a718fdf 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "aho-corasick" -version = "0.7.18" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" +checksum = "6748e8def348ed4d14996fa801f4122cd763fff530258cdc03f64b25f89d3a5a" dependencies = [ "memchr", ] @@ -24,15 +24,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7021ce4924a3f25f802b2cccd1af585e39ea1a363a1aa2e72afe54b67a3a7a7" [[package]] -name = "atty" -version = "0.2.14" +name = "anstyle" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi", - "libc", - "winapi", -] +checksum = "15c4c2c83f81532e5845a733998b6971faca23490340a418e9b72a3ec9de12ea" [[package]] name = "autocfg" @@ -46,6 +41,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4682ae6287fcf752ecaabbfcc7b6f9b72aa33933dc23a554d853aea8eea8635" + [[package]] name = "bumpalo" version = "3.12.0" @@ -58,6 +59,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" +[[package]] +name = "cc" +version = "1.0.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +dependencies = [ + "libc", +] + [[package]] name = "cfg-if" version = "1.0.0" @@ -102,40 +112,44 @@ dependencies = [ [[package]] name = "clap" -version = "3.2.22" +version = "4.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86447ad904c7fb335a790c9d7fe3d0d971dc523b8ccd1561a520de9a85302750" +checksum = "1d5f1946157a96594eb2d2c10eb7ad9a2b27518cb3000209dec700c35df9197d" dependencies = [ - "bitflags", - "clap_lex", - "indexmap", - "textwrap", + "clap_builder", ] [[package]] -name = "clap_lex" -version = "0.2.4" +name = "clap_builder" +version = "4.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5" +checksum = "78116e32a042dd73c2901f0dc30790d20ff3447f3e3472fad359e8c3d282bcd6" dependencies = [ - "os_str_bytes", + "anstyle", + "clap_lex", ] +[[package]] +name = "clap_lex" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd7cc57abe963c6d3b9d8be5b06ba7c8957a930305ca90304f24ef040aa6f961" + [[package]] name = "criterion" -version = "0.4.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7c76e09c1aae2bc52b3d2f29e13c6572553b30c4aa1b8a49fd70de6412654cb" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" dependencies = [ "anes", - "atty", "cast", "ciborium", "clap", "criterion-plot", - "itertools", - "lazy_static", + "is-terminal", + "itertools 0.10.5", "num-traits", + "once_cell", "oorandom", "plotters", "rayon", @@ -154,7 +168,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" dependencies = [ "cast", - "itertools", + "itertools 0.10.5", ] [[package]] @@ -214,6 +228,27 @@ version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" +[[package]] +name = "errno" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b30f669a7961ef1631673d2766cc92f52d64f7ef354d4fe0ddfd30ed52f0f4f" +dependencies = [ + "errno-dragonfly", + "libc", + "windows-sys", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + [[package]] name = "glob" version = "0.3.0" @@ -226,12 +261,6 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" -[[package]] -name = "hashbrown" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" - [[package]] name = "hermit-abi" version = "0.1.19" @@ -242,14 +271,10 @@ dependencies = [ ] [[package]] -name = "indexmap" -version = "1.9.1" +name = "hermit-abi" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" -dependencies = [ - "autocfg", - "hashbrown", -] +checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" [[package]] name = "indoc" @@ -266,6 +291,17 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "is-terminal" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" +dependencies = [ + "hermit-abi 0.3.2", + "rustix", + "windows-sys", +] + [[package]] name = "itertools" version = "0.10.5" @@ -275,6 +311,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.2" @@ -309,12 +354,12 @@ dependencies = [ "chic", "criterion", "difference", - "itertools", + "itertools 0.11.0", "libcst_derive", - "once_cell", "paste", "peg", "pyo3", + "rayon", "regex", "thiserror", ] @@ -328,6 +373,12 @@ dependencies = [ "trybuild", ] +[[package]] +name = "linux-raw-sys" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57bcfdad1b858c2db7c38303a6d2ad4dfaf5eb53dfeb0910128b2c26d6158503" + [[package]] name = "lock_api" version = "0.4.7" @@ -377,7 +428,7 @@ version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" dependencies = [ - "hermit-abi", + "hermit-abi 0.1.19", "libc", ] @@ -393,12 +444,6 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" -[[package]] -name = "os_str_bytes" -version = "6.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ff7415e9ae3fff1225851df9e0d9e4e5479f947619774677a63572e55e80eff" - [[package]] name = "parking_lot" version = "0.11.2" @@ -565,21 +610,19 @@ dependencies = [ [[package]] name = "rayon" -version = "1.5.3" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d" +checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b" dependencies = [ - "autocfg", - "crossbeam-deque", "either", "rayon-core", ] [[package]] name = "rayon-core" -version = "1.9.3" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" +checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d" dependencies = [ "crossbeam-channel", "crossbeam-deque", @@ -593,14 +636,26 @@ version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "62f25bc4c7e55e0b0b7a1d43fb893f4fa1361d0abe38b9ce4f323c2adfe6ef42" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] name = "regex" -version = "1.7.0" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81bc1d4caf89fac26a70747fe603c130093b53c773888797a6329091246d651a" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e076559ef8e241f2ae3479e36f97bd5741c0330689e217ad51ce2c76808b868a" +checksum = "fed1ceff11a1dddaee50c9dc8e4938bd106e9d89ae372f192311e7da498e3b69" dependencies = [ "aho-corasick", "memchr", @@ -609,9 +664,22 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.27" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" +checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2" + +[[package]] +name = "rustix" +version = "0.38.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bfe0f2582b4931a45d1fa608f8a8722e8b3c7ac54dd6d5f3b3212791fedef49" +dependencies = [ + "bitflags 2.4.0", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] [[package]] name = "ryu" @@ -697,12 +765,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "textwrap" -version = "0.15.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "949517c0cf1bf4ee812e2e07e08ab448e3ae0d23472aee8a06c985f0c8815b16" - [[package]] name = "thiserror" version = "1.0.37" @@ -874,3 +936,69 @@ name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" diff --git a/native/libcst/Cargo.toml b/native/libcst/Cargo.toml index 4f0b848aa..6c842a392 100644 --- a/native/libcst/Cargo.toml +++ b/native/libcst/Cargo.toml @@ -8,7 +8,7 @@ name = "libcst" version = "0.1.0" authors = ["LibCST Developers"] edition = "2018" -rust-version = "1.53" +rust-version = "1.70" [lib] name = "libcst_native" @@ -34,14 +34,14 @@ pyo3 = { version = ">=0.17", optional = true } thiserror = "1.0.37" peg = "0.8.1" chic = "1.2.2" -itertools = "0.10.5" -once_cell = "1.16.0" -regex = "1.7.0" +regex = "1.9.3" libcst_derive = { path = "../libcst_derive" } [dev-dependencies] -criterion = { version = "0.4.0", features = ["html_reports"] } +criterion = { version = "0.5.1", features = ["html_reports"] } difference = "2.0.0" +rayon = "1.7.0" +itertools = "0.11.0" [[bench]] name = "parser_benchmark" diff --git a/native/libcst/benches/parser_benchmark.rs b/native/libcst/benches/parser_benchmark.rs index 0b9849a47..4987022ab 100644 --- a/native/libcst/benches/parser_benchmark.rs +++ b/native/libcst/benches/parser_benchmark.rs @@ -9,9 +9,12 @@ use std::{ }; use criterion::{ - black_box, criterion_group, criterion_main, measurement::Measurement, BatchSize, Criterion, + black_box, criterion_group, criterion_main, measurement::Measurement, BatchSize, BenchmarkId, + Criterion, Throughput, }; use itertools::Itertools; +use rayon::prelude::*; + use libcst_native::{ parse_module, parse_tokens_without_whitespace, tokenize, Codegen, Config, Inflate, }; @@ -21,7 +24,7 @@ const NEWLINE: &str = "\n"; #[cfg(windows)] const NEWLINE: &str = "\r\n"; -fn load_all_fixtures() -> String { +fn load_all_fixtures_vec() -> Vec { let mut path = PathBuf::from(file!()); path.pop(); path.pop(); @@ -42,7 +45,11 @@ fn load_all_fixtures() -> String { let path = file.unwrap().path(); std::fs::read_to_string(&path).expect("reading_file") }) - .join(NEWLINE) + .collect() +} + +fn load_all_fixtures() -> String { + load_all_fixtures_vec().join(NEWLINE) } pub fn inflate_benchmarks(c: &mut Criterion) { @@ -117,9 +124,47 @@ pub fn parse_into_cst_benchmarks(c: &mut Criterion) { group.finish(); } +pub fn parse_into_cst_multithreaded_benchmarks( + c: &mut Criterion, +) where + ::Value: Send, +{ + let fixtures = load_all_fixtures_vec(); + let mut group = c.benchmark_group("parse_into_cst_parallel"); + group.measurement_time(Duration::from_secs(15)); + group.warm_up_time(Duration::from_secs(5)); + + for thread_count in 1..10 { + let expanded_fixtures = (0..thread_count) + .flat_map(|_| fixtures.clone()) + .collect_vec(); + group.throughput(Throughput::Elements(expanded_fixtures.len() as u64)); + group.bench_with_input( + BenchmarkId::from_parameter(thread_count), + &thread_count, + |b, thread_count| { + let thread_pool = rayon::ThreadPoolBuilder::new() + .num_threads(*thread_count) + .build() + .unwrap(); + thread_pool.install(|| { + b.iter_with_large_drop(|| { + expanded_fixtures + .par_iter() + .map(|contents| black_box(parse_module(&contents, None))) + .collect::>() + }); + }); + }, + ); + } + + group.finish(); +} + criterion_group!( name=benches; config=Criterion::default(); - targets=parser_benchmarks, codegen_benchmarks, inflate_benchmarks, tokenize_benchmarks, parse_into_cst_benchmarks + targets=parser_benchmarks, codegen_benchmarks, inflate_benchmarks, tokenize_benchmarks, parse_into_cst_benchmarks, parse_into_cst_multithreaded_benchmarks ); criterion_main!(benches); diff --git a/native/libcst/src/parser/numbers.rs b/native/libcst/src/parser/numbers.rs index 6d7a0d8e7..95db532bf 100644 --- a/native/libcst/src/parser/numbers.rs +++ b/native/libcst/src/parser/numbers.rs @@ -3,7 +3,6 @@ // This source code is licensed under the MIT license found in the // LICENSE file in the root directory of this source tree -use once_cell::sync::Lazy; use regex::Regex; use crate::nodes::deflated::{Expression, Float, Imaginary, Integer}; @@ -13,51 +12,48 @@ static BIN: &str = r"0[bB](?:_?[01])+"; static OCT: &str = r"0[oO](?:_?[0-7])+"; static DECIMAL: &str = r"(?:0(?:_?0)*|[1-9](?:_?[0-9])*)"; -static INTEGER_RE: Lazy = Lazy::new(|| { - Regex::new(format!("^({}|{}|{}|{})$", HEX, BIN, OCT, DECIMAL).as_str()).expect("regex") -}); - static EXPONENT: &str = r"[eE][-+]?[0-9](?:_?[0-9])*"; // Note: these don't exactly match the python implementation (exponent is not included) static POINT_FLOAT: &str = r"([0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?|\.[0-9](?:_?[0-9])*)"; static EXP_FLOAT: &str = r"[0-9](?:_?[0-9])*"; -static FLOAT_RE: Lazy = Lazy::new(|| { - Regex::new( - format!( - "^({}({})?|{}{})$", - POINT_FLOAT, EXPONENT, EXP_FLOAT, EXPONENT +thread_local! { + static INTEGER_RE: Regex = + Regex::new(format!("^({}|{}|{}|{})$", HEX, BIN, OCT, DECIMAL).as_str()).expect("regex"); + static FLOAT_RE: Regex = + Regex::new( + format!( + "^({}({})?|{}{})$", + POINT_FLOAT, EXPONENT, EXP_FLOAT, EXPONENT + ) + .as_str(), ) - .as_str(), - ) - .expect("regex") -}); - -static IMAGINARY_RE: Lazy = Lazy::new(|| { - Regex::new( - format!( - r"^([0-9](?:_?[0-9])*[jJ]|({}({})?|{}{})[jJ])$", - POINT_FLOAT, EXPONENT, EXP_FLOAT, EXPONENT + .expect("regex"); + static IMAGINARY_RE: Regex = + Regex::new( + format!( + r"^([0-9](?:_?[0-9])*[jJ]|({}({})?|{}{})[jJ])$", + POINT_FLOAT, EXPONENT, EXP_FLOAT, EXPONENT + ) + .as_str(), ) - .as_str(), - ) - .expect("regex") -}); + .expect("regex"); +} pub(crate) fn parse_number(raw: &str) -> Expression { - if INTEGER_RE.is_match(raw) { + if INTEGER_RE.with(|r| r.is_match(raw)) { Expression::Integer(Box::new(Integer { value: raw, lpar: Default::default(), rpar: Default::default(), })) - } else if FLOAT_RE.is_match(raw) { + } else if FLOAT_RE.with(|r| r.is_match(raw)) { Expression::Float(Box::new(Float { value: raw, lpar: Default::default(), rpar: Default::default(), })) - } else if IMAGINARY_RE.is_match(raw) { + } else if IMAGINARY_RE.with(|r| r.is_match(raw)) { Expression::Imaginary(Box::new(Imaginary { value: raw, lpar: Default::default(), diff --git a/native/libcst/src/tokenizer/core/mod.rs b/native/libcst/src/tokenizer/core/mod.rs index 359451a3f..2365eaa36 100644 --- a/native/libcst/src/tokenizer/core/mod.rs +++ b/native/libcst/src/tokenizer/core/mod.rs @@ -58,7 +58,6 @@ /// [RustPython's parser]: https://crates.io/crates/rustpython-parser mod string_types; -use once_cell::sync::Lazy; use regex::Regex; use std::cell::RefCell; use std::cmp::Ordering; @@ -83,25 +82,27 @@ const MAX_INDENT: usize = 100; // https://github.com/rust-lang/rust/issues/71763 const MAX_CHAR: char = '\u{10ffff}'; -static SPACE_TAB_FORMFEED_RE: Lazy = Lazy::new(|| Regex::new(r"\A[ \f\t]+").expect("regex")); -static ANY_NON_NEWLINE_RE: Lazy = Lazy::new(|| Regex::new(r"\A[^\r\n]+").expect("regex")); -static STRING_PREFIX_RE: Lazy = - Lazy::new(|| Regex::new(r"\A(?i)(u|[bf]r|r[bf]|r|b|f)").expect("regex")); -static POTENTIAL_IDENTIFIER_TAIL_RE: Lazy = - Lazy::new(|| Regex::new(r"\A([a-zA-Z0-9_]|[^\x00-\x7f])+").expect("regex")); -static DECIMAL_DOT_DIGIT_RE: Lazy = Lazy::new(|| Regex::new(r"\A\.[0-9]").expect("regex")); -static DECIMAL_TAIL_RE: Lazy = - Lazy::new(|| Regex::new(r"\A[0-9](_?[0-9])*").expect("regex")); -static HEXADECIMAL_TAIL_RE: Lazy = - Lazy::new(|| Regex::new(r"\A(_?[0-9a-fA-F])+").expect("regex")); -static OCTAL_TAIL_RE: Lazy = Lazy::new(|| Regex::new(r"\A(_?[0-7])+").expect("regex")); -static BINARY_TAIL_RE: Lazy = Lazy::new(|| Regex::new(r"\A(_?[01])+").expect("regex")); - -/// Used to verify identifiers when there's a non-ascii character in them. -// This changes across unicode revisions. We'd need to ship our own unicode tables to 100% match a -// given Python version's behavior. -static UNICODE_IDENTIFIER_RE: Lazy = - Lazy::new(|| Regex::new(r"\A[\p{XID_Start}_]\p{XID_Continue}*\z").expect("regex")); +thread_local! { + static SPACE_TAB_FORMFEED_RE: Regex = Regex::new(r"\A[ \f\t]+").expect("regex"); + static ANY_NON_NEWLINE_RE: Regex = Regex::new(r"\A[^\r\n]+").expect("regex"); + static STRING_PREFIX_RE: Regex = + Regex::new(r"\A(?i)(u|[bf]r|r[bf]|r|b|f)").expect("regex"); + static POTENTIAL_IDENTIFIER_TAIL_RE: Regex = + Regex::new(r"\A([a-zA-Z0-9_]|[^\x00-\x7f])+").expect("regex"); + static DECIMAL_DOT_DIGIT_RE: Regex = Regex::new(r"\A\.[0-9]").expect("regex"); + static DECIMAL_TAIL_RE: Regex = + Regex::new(r"\A[0-9](_?[0-9])*").expect("regex"); + static HEXADECIMAL_TAIL_RE: Regex = + Regex::new(r"\A(_?[0-9a-fA-F])+").expect("regex"); + static OCTAL_TAIL_RE: Regex = Regex::new(r"\A(_?[0-7])+").expect("regex"); + static BINARY_TAIL_RE: Regex = Regex::new(r"\A(_?[01])+").expect("regex"); + + /// Used to verify identifiers when there's a non-ascii character in them. + // This changes across unicode revisions. We'd need to ship our own unicode tables to 100% match a + // given Python version's behavior. + static UNICODE_IDENTIFIER_RE: Regex = + Regex::new(r"\A[\p{XID_Start}_]\p{XID_Continue}*\z").expect("regex"); +} #[derive(Debug, Eq, PartialEq, Copy, Clone)] pub enum TokType { @@ -316,11 +317,11 @@ impl<'t> TokState<'t> { 'again: loop { // Skip spaces - self.text_pos.consume(&*SPACE_TAB_FORMFEED_RE); + SPACE_TAB_FORMFEED_RE.with(|v| self.text_pos.consume(v)); // Skip comment, unless it's a type comment if self.text_pos.peek() == Some('#') { - self.text_pos.consume(&*ANY_NON_NEWLINE_RE); + ANY_NON_NEWLINE_RE.with(|v| self.text_pos.consume(v)); // type_comment is not supported } @@ -384,7 +385,7 @@ impl<'t> TokState<'t> { } // Number starting with period - Some('.') if self.text_pos.matches(&*DECIMAL_DOT_DIGIT_RE) => { + Some('.') if DECIMAL_DOT_DIGIT_RE.with(|r| self.text_pos.matches(r)) => { self.consume_number(NumberState::Fraction) } @@ -472,7 +473,7 @@ impl<'t> TokState<'t> { } // Operator - Some(_) if self.text_pos.consume(&*OPERATOR_RE) => Ok(TokType::Op), + Some(_) if OPERATOR_RE.with(|r| self.text_pos.consume(r)) => Ok(TokType::Op), // Bad character // If nothing works, fall back to this error. CPython returns an OP in this case, @@ -623,7 +624,7 @@ impl<'t> TokState<'t> { fn consume_identifier_or_prefixed_string(&mut self) -> Result> { // Process the various legal combinations of b"", r"", u"", and f"". - if self.text_pos.consume(&*STRING_PREFIX_RE) { + if STRING_PREFIX_RE.with(|r| self.text_pos.consume(r)) { if let Some('"') | Some('\'') = self.text_pos.peek() { // We found a string, not an identifier. Bail! if self.split_fstring @@ -645,7 +646,7 @@ impl<'t> TokState<'t> { Some('a'..='z') | Some('A'..='Z') | Some('_') | Some('\u{80}'..=MAX_CHAR) )); } - self.text_pos.consume(&*POTENTIAL_IDENTIFIER_TAIL_RE); + POTENTIAL_IDENTIFIER_TAIL_RE.with(|r| self.text_pos.consume(r)); let identifier_str = self.text_pos.slice_from_start_pos(&self.start_pos); if !verify_identifier(identifier_str) { // TODO: async/await @@ -691,7 +692,7 @@ impl<'t> TokState<'t> { match self.text_pos.peek() { Some('x') | Some('X') => { self.text_pos.next(); - if !self.text_pos.consume(&*HEXADECIMAL_TAIL_RE) + if !HEXADECIMAL_TAIL_RE.with(|r| self.text_pos.consume(r)) || self.text_pos.peek() == Some('_') { Err(TokError::BadHexadecimal) @@ -701,7 +702,7 @@ impl<'t> TokState<'t> { } Some('o') | Some('O') => { self.text_pos.next(); - if !self.text_pos.consume(&*OCTAL_TAIL_RE) + if !OCTAL_TAIL_RE.with(|r| self.text_pos.consume(r)) || self.text_pos.peek() == Some('_') { return Err(TokError::BadOctal); @@ -715,7 +716,7 @@ impl<'t> TokState<'t> { } Some('b') | Some('B') => { self.text_pos.next(); - if !self.text_pos.consume(&*BINARY_TAIL_RE) + if !BINARY_TAIL_RE.with(|r| self.text_pos.consume(r)) || self.text_pos.peek() == Some('_') { return Err(TokError::BadBinary); @@ -819,7 +820,7 @@ impl<'t> TokState<'t> { /// Processes a decimal tail. This is the bit after the dot or after an E in a float. fn consume_decimal_tail(&mut self) -> Result<(), TokError<'t>> { - let result = self.text_pos.consume(&*DECIMAL_TAIL_RE); + let result = DECIMAL_TAIL_RE.with(|r| self.text_pos.consume(r)); // Assumption: If we've been called, the first character is an integer, so we must have a // regex match debug_assert!(result, "try_decimal_tail was called on a non-digit char"); @@ -1058,7 +1059,7 @@ fn verify_identifier(name: &str) -> bool { // TODO: If `name` is non-ascii, must first normalize name to NFKC. // Common case: If the entire string is ascii, we can avoid the more expensive regex check, // since the tokenizer already validates ascii characters before calling us. - name.is_ascii() || UNICODE_IDENTIFIER_RE.is_match(name) + name.is_ascii() || UNICODE_IDENTIFIER_RE.with(|r| r.is_match(name)) } #[derive(Clone)] diff --git a/native/libcst/src/tokenizer/operators.rs b/native/libcst/src/tokenizer/operators.rs index e5ef1526f..3252f774c 100644 --- a/native/libcst/src/tokenizer/operators.rs +++ b/native/libcst/src/tokenizer/operators.rs @@ -8,7 +8,6 @@ // code or that we retain the original work's copyright information. // https://docs.python.org/3/license.html#zero-clause-bsd-license-for-code-in-the-python-release-documentation -use once_cell::sync::Lazy; use regex::Regex; /// A list of strings that make up all the possible operators in a specific version of Python. @@ -69,7 +68,8 @@ pub const OPERATORS: &[&str] = &[ "<>", ]; -pub static OPERATOR_RE: Lazy = Lazy::new(|| { +thread_local! { +pub static OPERATOR_RE: Regex = { // sort operators so that we try to match the longest ones first let mut sorted_operators: Box<[&str]> = OPERATORS.into(); sorted_operators.sort_unstable_by_key(|op| usize::MAX - op.len()); @@ -82,4 +82,5 @@ pub static OPERATOR_RE: Lazy = Lazy::new(|| { .join("|") )) .expect("regex") -}); +}; +} diff --git a/native/libcst/src/tokenizer/text_position/mod.rs b/native/libcst/src/tokenizer/text_position/mod.rs index 2e58600a2..fece9e3d3 100644 --- a/native/libcst/src/tokenizer/text_position/mod.rs +++ b/native/libcst/src/tokenizer/text_position/mod.rs @@ -5,14 +5,15 @@ mod char_width; -use once_cell::sync::Lazy; use regex::Regex; use std::fmt; use crate::tokenizer::debug_utils::EllipsisDebug; use char_width::NewlineNormalizedCharWidths; -static CR_OR_LF_RE: Lazy = Lazy::new(|| Regex::new(r"[\r\n]").expect("regex")); +thread_local! { + static CR_OR_LF_RE: Regex = Regex::new(r"[\r\n]").expect("regex"); +} pub trait TextPattern { fn match_len(&self, text: &str) -> Option; @@ -98,7 +99,7 @@ impl<'t> TextPosition<'t> { match match_len { Some(match_len) => { assert!( - !CR_OR_LF_RE.is_match(&rest_of_text[..match_len]), + !CR_OR_LF_RE.with(|r| r.is_match(&rest_of_text[..match_len])), "matches pattern must not match a newline", ); true diff --git a/native/libcst/src/tokenizer/whitespace_parser.rs b/native/libcst/src/tokenizer/whitespace_parser.rs index f09ce7895..be5b77528 100644 --- a/native/libcst/src/tokenizer/whitespace_parser.rs +++ b/native/libcst/src/tokenizer/whitespace_parser.rs @@ -7,7 +7,6 @@ use crate::nodes::{ Comment, EmptyLine, Fakeness, Newline, ParenthesizableWhitespace, ParenthesizedWhitespace, SimpleWhitespace, TrailingWhitespace, }; -use once_cell::sync::Lazy; use regex::Regex; use thiserror::Error; @@ -15,10 +14,12 @@ use crate::Token; use super::TokType; -static SIMPLE_WHITESPACE_RE: Lazy = - Lazy::new(|| Regex::new(r"\A([ \f\t]|\\(\r\n?|\n))*").expect("regex")); -static NEWLINE_RE: Lazy = Lazy::new(|| Regex::new(r"\A(\r\n?|\n)").expect("regex")); -static COMMENT_RE: Lazy = Lazy::new(|| Regex::new(r"\A#[^\r\n]*").expect("regex")); +thread_local! { + static SIMPLE_WHITESPACE_RE: Regex = Regex::new(r"\A([ \f\t]|\\(\r\n?|\n))*").expect("regex"); +static NEWLINE_RE: Regex = Regex::new(r"\A(\r\n?|\n)").expect("regex"); +static COMMENT_RE: Regex = Regex::new(r"\A#[^\r\n]*").expect("regex"); +static NEWLINE_RE_2: Regex = Regex::new(r"\r\n?|\n").expect("regex"); +} #[allow(clippy::upper_case_acronyms, clippy::enum_variant_names)] #[derive(Error, Debug, PartialEq, Eq)] @@ -73,11 +74,8 @@ impl<'a> Config<'a> { break; } } - let default_newline = Regex::new(r"\r\n?|\n") - .expect("regex") - .find(input) - .map(|m| m.as_str()) - .unwrap_or("\n"); + let default_newline = + NEWLINE_RE_2.with(|r| r.find(input).map(|m| m.as_str()).unwrap_or("\n")); Self { input, @@ -200,9 +198,8 @@ pub fn parse_empty_lines<'a>( } pub fn parse_comment<'a>(config: &Config<'a>, state: &mut State) -> Result>> { - if let Some(comment_match) = - COMMENT_RE.find(config.get_line_after_column(state.line, state.column_byte)?) - { + let newline_after = config.get_line_after_column(state.line, state.column_byte)?; + if let Some(comment_match) = COMMENT_RE.with(|r| r.find(newline_after)) { let comment_str = comment_match.as_str(); advance_this_line( config, @@ -216,9 +213,8 @@ pub fn parse_comment<'a>(config: &Config<'a>, state: &mut State) -> Result(config: &Config<'a>, state: &mut State) -> Result>> { - if let Some(newline_match) = - NEWLINE_RE.find(config.get_line_after_column(state.line, state.column_byte)?) - { + let newline_after = config.get_line_after_column(state.line, state.column_byte)?; + if let Some(newline_match) = NEWLINE_RE.with(|r| r.find(newline_after)) { let newline_str = newline_match.as_str(); advance_this_line( config, @@ -350,10 +346,11 @@ pub fn parse_simple_whitespace<'a>( let capture_ws = |line, col| -> Result<&'a str> { let x = config.get_line_after_column(line, col); let x = x?; - Ok(SIMPLE_WHITESPACE_RE - .find(x) - .expect("SIMPLE_WHITESPACE_RE supports 0-length matches, so it must always match") - .as_str()) + Ok(SIMPLE_WHITESPACE_RE.with(|r| { + r.find(x) + .expect("SIMPLE_WHITESPACE_RE supports 0-length matches, so it must always match") + .as_str() + })) }; let start_offset = state.byte_offset; let mut prev_line: &str;