
Commit

Switch to using thread_local regular expressions to avoid regex mutex contention
orf committed Aug 26, 2023
1 parent b28777e commit 3f34a66
Showing 8 changed files with 327 additions and 158 deletions.
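
The files below all apply the same mechanical change: a process-wide once_cell::sync::Lazy<Regex> static becomes a thread_local! static accessed through .with(...). A compiled Regex hands out scratch space for its match engines from an internal shared pool, and many threads matching against one shared static can contend on that pool; giving each thread its own copy removes the cross-thread synchronization at the cost of one regex compilation per thread. A minimal sketch of the before/after shape (WORD_RE is an illustrative name, not from this commit):

    use regex::Regex;

    // Before: one shared, lazily initialized regex for the whole process;
    // all threads draw match-time scratch space from its internal pool.
    // static WORD_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\w+").expect("regex"));

    // After: one regex per thread, so matching never synchronizes across threads.
    thread_local! {
        static WORD_RE: Regex = Regex::new(r"\w+").expect("regex");
    }

    fn count_words(text: &str) -> usize {
        // `with` borrows this thread's copy; the closure's return value
        // is passed back out to the caller.
        WORD_RE.with(|r| r.find_iter(text).count())
    }

    fn main() {
        assert_eq!(count_words("switch to thread_local regexes"), 4);
    }
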
258 changes: 193 additions & 65 deletions native/Cargo.lock

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions native/libcst/Cargo.toml
@@ -8,7 +8,7 @@ name = "libcst"
 version = "0.1.0"
 authors = ["LibCST Developers"]
 edition = "2018"
-rust-version = "1.53"
+rust-version = "1.70"
 
 [lib]
 name = "libcst_native"
@@ -34,14 +34,14 @@ pyo3 = { version = ">=0.17", optional = true }
 thiserror = "1.0.37"
 peg = "0.8.1"
 chic = "1.2.2"
-itertools = "0.10.5"
-once_cell = "1.16.0"
-regex = "1.7.0"
+regex = "1.9.3"
 libcst_derive = { path = "../libcst_derive" }
 
 [dev-dependencies]
-criterion = { version = "0.4.0", features = ["html_reports"] }
+criterion = { version = "0.5.1", features = ["html_reports"] }
 difference = "2.0.0"
+rayon = "1.7.0"
+itertools = "0.11.0"
 
 [[bench]]
 name = "parser_benchmark"
53 changes: 49 additions & 4 deletions native/libcst/benches/parser_benchmark.rs
@@ -9,9 +9,12 @@ use std::{
 };
 
 use criterion::{
-    black_box, criterion_group, criterion_main, measurement::Measurement, BatchSize, Criterion,
+    black_box, criterion_group, criterion_main, measurement::Measurement, BatchSize, BenchmarkId,
+    Criterion, Throughput,
 };
+use itertools::Itertools;
+use rayon::prelude::*;
 
 use libcst_native::{
     parse_module, parse_tokens_without_whitespace, tokenize, Codegen, Config, Inflate,
 };
@@ -21,7 +24,7 @@ const NEWLINE: &str = "\n";
 #[cfg(windows)]
 const NEWLINE: &str = "\r\n";
 
-fn load_all_fixtures() -> String {
+fn load_all_fixtures_vec() -> Vec<String> {
     let mut path = PathBuf::from(file!());
     path.pop();
     path.pop();
@@ -42,7 +45,11 @@ fn load_all_fixtures() -> String {
             let path = file.unwrap().path();
             std::fs::read_to_string(&path).expect("reading_file")
         })
-        .join(NEWLINE)
+        .collect()
+}
+
+fn load_all_fixtures() -> String {
+    load_all_fixtures_vec().join(NEWLINE)
 }
 
 pub fn inflate_benchmarks<T: Measurement>(c: &mut Criterion<T>) {
@@ -117,9 +124,47 @@ pub fn parse_into_cst_benchmarks<T: Measurement>(c: &mut Criterion<T>) {
     group.finish();
 }
 
+pub fn parse_into_cst_multithreaded_benchmarks<T: Measurement + std::marker::Sync>(
+    c: &mut Criterion<T>,
+) where
+    <T as Measurement>::Value: Send,
+{
+    let fixtures = load_all_fixtures_vec();
+    let mut group = c.benchmark_group("parse_into_cst_parallel");
+    group.measurement_time(Duration::from_secs(15));
+    group.warm_up_time(Duration::from_secs(5));
+
+    for thread_count in 1..10 {
+        let expanded_fixtures = (0..thread_count)
+            .flat_map(|_| fixtures.clone())
+            .collect_vec();
+        group.throughput(Throughput::Elements(expanded_fixtures.len() as u64));
+        group.bench_with_input(
+            BenchmarkId::from_parameter(thread_count),
+            &thread_count,
+            |b, thread_count| {
+                let thread_pool = rayon::ThreadPoolBuilder::new()
+                    .num_threads(*thread_count)
+                    .build()
+                    .unwrap();
+                thread_pool.install(|| {
+                    b.iter_with_large_drop(|| {
+                        expanded_fixtures
+                            .par_iter()
+                            .map(|contents| black_box(parse_module(&contents, None)))
+                            .collect::<Vec<_>>()
+                    });
+                });
+            },
+        );
+    }
+
+    group.finish();
+}
+
 criterion_group!(
     name=benches;
     config=Criterion::default();
-    targets=parser_benchmarks, codegen_benchmarks, inflate_benchmarks, tokenize_benchmarks, parse_into_cst_benchmarks
+    targets=parser_benchmarks, codegen_benchmarks, inflate_benchmarks, tokenize_benchmarks, parse_into_cst_benchmarks, parse_into_cst_multithreaded_benchmarks
 );
 criterion_main!(benches);
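
The new benchmark group scales the workload with the thread count (each thread count gets thread_count copies of the fixture set) and reports throughput per element, so per-file parse cost stays comparable as parallelism grows from 1 to 9 threads. Assuming a standard criterion setup, the group can be run on its own via criterion's name filter:

    cargo bench --bench parser_benchmark -- parse_into_cst_parallel
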
50 changes: 23 additions & 27 deletions native/libcst/src/parser/numbers.rs
@@ -3,7 +3,6 @@
 // This source code is licensed under the MIT license found in the
 // LICENSE file in the root directory of this source tree
 
-use once_cell::sync::Lazy;
 use regex::Regex;
 
 use crate::nodes::deflated::{Expression, Float, Imaginary, Integer};
@@ -13,51 +12,48 @@ static BIN: &str = r"0[bB](?:_?[01])+";
 static OCT: &str = r"0[oO](?:_?[0-7])+";
 static DECIMAL: &str = r"(?:0(?:_?0)*|[1-9](?:_?[0-9])*)";
 
-static INTEGER_RE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(format!("^({}|{}|{}|{})$", HEX, BIN, OCT, DECIMAL).as_str()).expect("regex")
-});
-
 static EXPONENT: &str = r"[eE][-+]?[0-9](?:_?[0-9])*";
 // Note: these don't exactly match the python implementation (exponent is not included)
 static POINT_FLOAT: &str = r"([0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?|\.[0-9](?:_?[0-9])*)";
 static EXP_FLOAT: &str = r"[0-9](?:_?[0-9])*";
 
-static FLOAT_RE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(
-        format!(
-            "^({}({})?|{}{})$",
-            POINT_FLOAT, EXPONENT, EXP_FLOAT, EXPONENT
-        )
-        .as_str(),
-    )
-    .expect("regex")
-});
-
-static IMAGINARY_RE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(
-        format!(
-            r"^([0-9](?:_?[0-9])*[jJ]|({}({})?|{}{})[jJ])$",
-            POINT_FLOAT, EXPONENT, EXP_FLOAT, EXPONENT
-        )
-        .as_str(),
-    )
-    .expect("regex")
-});
+thread_local! {
+    static INTEGER_RE: Regex =
+        Regex::new(format!("^({}|{}|{}|{})$", HEX, BIN, OCT, DECIMAL).as_str()).expect("regex");
+    static FLOAT_RE: Regex =
+        Regex::new(
+            format!(
+                "^({}({})?|{}{})$",
+                POINT_FLOAT, EXPONENT, EXP_FLOAT, EXPONENT
+            )
+            .as_str(),
+        )
+        .expect("regex");
+    static IMAGINARY_RE: Regex =
+        Regex::new(
+            format!(
+                r"^([0-9](?:_?[0-9])*[jJ]|({}({})?|{}{})[jJ])$",
+                POINT_FLOAT, EXPONENT, EXP_FLOAT, EXPONENT
+            )
+            .as_str(),
+        )
+        .expect("regex");
+}
 
 pub(crate) fn parse_number(raw: &str) -> Expression {
-    if INTEGER_RE.is_match(raw) {
+    if INTEGER_RE.with(|r| r.is_match(raw)) {
         Expression::Integer(Box::new(Integer {
             value: raw,
             lpar: Default::default(),
             rpar: Default::default(),
         }))
-    } else if FLOAT_RE.is_match(raw) {
+    } else if FLOAT_RE.with(|r| r.is_match(raw)) {
         Expression::Float(Box::new(Float {
             value: raw,
             lpar: Default::default(),
             rpar: Default::default(),
         }))
-    } else if IMAGINARY_RE.is_match(raw) {
+    } else if IMAGINARY_RE.with(|r| r.is_match(raw)) {
         Expression::Imaginary(Box::new(Imaginary {
             value: raw,
             lpar: Default::default(),
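
One trade-off of this approach: a thread_local! value is initialized lazily, once per thread, so every new thread that calls parse_number recompiles the three regexes. That is negligible for long-lived worker threads (such as the rayon pool used in the benchmark above) but would add up if many short-lived threads each parsed a single file. A small, self-contained illustration of the per-thread lifecycle (the counter is a hypothetical stand-in, not code from this commit):

    use std::cell::Cell;

    thread_local! {
        // Each thread gets its own copy, initialized on first access.
        static ACCESSES: Cell<u32> = Cell::new(0);
    }

    fn touch() -> u32 {
        ACCESSES.with(|c| {
            c.set(c.get() + 1);
            c.get()
        })
    }

    fn main() {
        assert_eq!(touch(), 1);
        assert_eq!(touch(), 2); // same thread: same copy
        let from_new_thread = std::thread::spawn(touch).join().unwrap();
        assert_eq!(from_new_thread, 1); // new thread: fresh copy (for regexes, a fresh compile)
    }
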
63 changes: 32 additions & 31 deletions native/libcst/src/tokenizer/core/mod.rs
@@ -58,7 +58,6 @@
 /// [RustPython's parser]: https://crates.io/crates/rustpython-parser
 mod string_types;
 
-use once_cell::sync::Lazy;
 use regex::Regex;
 use std::cell::RefCell;
 use std::cmp::Ordering;
@@ -83,25 +82,27 @@ const MAX_INDENT: usize = 100;
 // https://github.com/rust-lang/rust/issues/71763
 const MAX_CHAR: char = '\u{10ffff}';
 
-static SPACE_TAB_FORMFEED_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A[ \f\t]+").expect("regex"));
-static ANY_NON_NEWLINE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A[^\r\n]+").expect("regex"));
-static STRING_PREFIX_RE: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"\A(?i)(u|[bf]r|r[bf]|r|b|f)").expect("regex"));
-static POTENTIAL_IDENTIFIER_TAIL_RE: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"\A([a-zA-Z0-9_]|[^\x00-\x7f])+").expect("regex"));
-static DECIMAL_DOT_DIGIT_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A\.[0-9]").expect("regex"));
-static DECIMAL_TAIL_RE: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"\A[0-9](_?[0-9])*").expect("regex"));
-static HEXADECIMAL_TAIL_RE: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"\A(_?[0-9a-fA-F])+").expect("regex"));
-static OCTAL_TAIL_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A(_?[0-7])+").expect("regex"));
-static BINARY_TAIL_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A(_?[01])+").expect("regex"));
-
-/// Used to verify identifiers when there's a non-ascii character in them.
-// This changes across unicode revisions. We'd need to ship our own unicode tables to 100% match a
-// given Python version's behavior.
-static UNICODE_IDENTIFIER_RE: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"\A[\p{XID_Start}_]\p{XID_Continue}*\z").expect("regex"));
+thread_local! {
+    static SPACE_TAB_FORMFEED_RE: Regex = Regex::new(r"\A[ \f\t]+").expect("regex");
+    static ANY_NON_NEWLINE_RE: Regex = Regex::new(r"\A[^\r\n]+").expect("regex");
+    static STRING_PREFIX_RE: Regex =
+        Regex::new(r"\A(?i)(u|[bf]r|r[bf]|r|b|f)").expect("regex");
+    static POTENTIAL_IDENTIFIER_TAIL_RE: Regex =
+        Regex::new(r"\A([a-zA-Z0-9_]|[^\x00-\x7f])+").expect("regex");
+    static DECIMAL_DOT_DIGIT_RE: Regex = Regex::new(r"\A\.[0-9]").expect("regex");
+    static DECIMAL_TAIL_RE: Regex =
+        Regex::new(r"\A[0-9](_?[0-9])*").expect("regex");
+    static HEXADECIMAL_TAIL_RE: Regex =
+        Regex::new(r"\A(_?[0-9a-fA-F])+").expect("regex");
+    static OCTAL_TAIL_RE: Regex = Regex::new(r"\A(_?[0-7])+").expect("regex");
+    static BINARY_TAIL_RE: Regex = Regex::new(r"\A(_?[01])+").expect("regex");
+
+    /// Used to verify identifiers when there's a non-ascii character in them.
+    // This changes across unicode revisions. We'd need to ship our own unicode tables to 100% match a
+    // given Python version's behavior.
+    static UNICODE_IDENTIFIER_RE: Regex =
+        Regex::new(r"\A[\p{XID_Start}_]\p{XID_Continue}*\z").expect("regex");
+}
 
 #[derive(Debug, Eq, PartialEq, Copy, Clone)]
 pub enum TokType {
@@ -316,11 +317,11 @@ impl<'t> TokState<'t> {
 
         'again: loop {
             // Skip spaces
-            self.text_pos.consume(&*SPACE_TAB_FORMFEED_RE);
+            SPACE_TAB_FORMFEED_RE.with(|v| self.text_pos.consume(v));
 
             // Skip comment, unless it's a type comment
             if self.text_pos.peek() == Some('#') {
-                self.text_pos.consume(&*ANY_NON_NEWLINE_RE);
+                ANY_NON_NEWLINE_RE.with(|v| self.text_pos.consume(v));
                 // type_comment is not supported
             }
@@ -384,7 +385,7 @@ impl<'t> TokState<'t> {
             }
 
             // Number starting with period
-            Some('.') if self.text_pos.matches(&*DECIMAL_DOT_DIGIT_RE) => {
+            Some('.') if DECIMAL_DOT_DIGIT_RE.with(|r| self.text_pos.matches(r)) => {
                 self.consume_number(NumberState::Fraction)
             }
@@ -472,7 +473,7 @@ impl<'t> TokState<'t> {
             }
 
             // Operator
-            Some(_) if self.text_pos.consume(&*OPERATOR_RE) => Ok(TokType::Op),
+            Some(_) if OPERATOR_RE.with(|r| self.text_pos.consume(r)) => Ok(TokType::Op),
 
             // Bad character
             // If nothing works, fall back to this error. CPython returns an OP in this case,
@@ -623,7 +624,7 @@ impl<'t> TokState<'t> {
 
     fn consume_identifier_or_prefixed_string(&mut self) -> Result<TokType, TokError<'t>> {
         // Process the various legal combinations of b"", r"", u"", and f"".
-        if self.text_pos.consume(&*STRING_PREFIX_RE) {
+        if STRING_PREFIX_RE.with(|r| self.text_pos.consume(r)) {
             if let Some('"') | Some('\'') = self.text_pos.peek() {
                 // We found a string, not an identifier. Bail!
                 if self.split_fstring
@@ -645,7 +646,7 @@ impl<'t> TokState<'t> {
                 Some('a'..='z') | Some('A'..='Z') | Some('_') | Some('\u{80}'..=MAX_CHAR)
             ));
         }
-        self.text_pos.consume(&*POTENTIAL_IDENTIFIER_TAIL_RE);
+        POTENTIAL_IDENTIFIER_TAIL_RE.with(|r| self.text_pos.consume(r));
         let identifier_str = self.text_pos.slice_from_start_pos(&self.start_pos);
         if !verify_identifier(identifier_str) {
             // TODO: async/await
@@ -691,7 +692,7 @@ impl<'t> TokState<'t> {
         match self.text_pos.peek() {
             Some('x') | Some('X') => {
                 self.text_pos.next();
-                if !self.text_pos.consume(&*HEXADECIMAL_TAIL_RE)
+                if !HEXADECIMAL_TAIL_RE.with(|r| self.text_pos.consume(r))
                     || self.text_pos.peek() == Some('_')
                 {
                     Err(TokError::BadHexadecimal)
@@ -701,7 +702,7 @@ impl<'t> TokState<'t> {
             }
             Some('o') | Some('O') => {
                 self.text_pos.next();
-                if !self.text_pos.consume(&*OCTAL_TAIL_RE)
+                if !OCTAL_TAIL_RE.with(|r| self.text_pos.consume(r))
                     || self.text_pos.peek() == Some('_')
                 {
                     return Err(TokError::BadOctal);
@@ -715,7 +716,7 @@ impl<'t> TokState<'t> {
             }
             Some('b') | Some('B') => {
                 self.text_pos.next();
-                if !self.text_pos.consume(&*BINARY_TAIL_RE)
+                if !BINARY_TAIL_RE.with(|r| self.text_pos.consume(r))
                     || self.text_pos.peek() == Some('_')
                 {
                     return Err(TokError::BadBinary);
@@ -819,7 +820,7 @@ impl<'t> TokState<'t> {
 
     /// Processes a decimal tail. This is the bit after the dot or after an E in a float.
     fn consume_decimal_tail(&mut self) -> Result<(), TokError<'t>> {
-        let result = self.text_pos.consume(&*DECIMAL_TAIL_RE);
+        let result = DECIMAL_TAIL_RE.with(|r| self.text_pos.consume(r));
         // Assumption: If we've been called, the first character is an integer, so we must have a
         // regex match
        debug_assert!(result, "try_decimal_tail was called on a non-digit char");
@@ -1058,7 +1059,7 @@ fn verify_identifier(name: &str) -> bool {
     // TODO: If `name` is non-ascii, must first normalize name to NFKC.
     // Common case: If the entire string is ascii, we can avoid the more expensive regex check,
     // since the tokenizer already validates ascii characters before calling us.
-    name.is_ascii() || UNICODE_IDENTIFIER_RE.is_match(name)
+    name.is_ascii() || UNICODE_IDENTIFIER_RE.with(|r| r.is_match(name))
 }
 
 #[derive(Clone)]
7 changes: 4 additions & 3 deletions native/libcst/src/tokenizer/operators.rs
@@ -8,7 +8,6 @@
 // code or that we retain the original work's copyright information.
 // https://docs.python.org/3/license.html#zero-clause-bsd-license-for-code-in-the-python-release-documentation
 
-use once_cell::sync::Lazy;
 use regex::Regex;
 
 /// A list of strings that make up all the possible operators in a specific version of Python.
@@ -69,7 +68,8 @@ pub const OPERATORS: &[&str] = &[
     "<>",
 ];
 
-pub static OPERATOR_RE: Lazy<Regex> = Lazy::new(|| {
+thread_local! {
+    pub static OPERATOR_RE: Regex = {
         // sort operators so that we try to match the longest ones first
        let mut sorted_operators: Box<[&str]> = OPERATORS.into();
        sorted_operators.sort_unstable_by_key(|op| usize::MAX - op.len());
@@ -82,4 +82,5 @@ pub static OPERATOR_RE: Lazy<Regex> = Lazy::new(|| {
            .join("|")
        ))
        .expect("regex")
-});
+    };
+}
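
The sort above matters because the regex crate uses leftmost-first alternation: branches are tried in the order written, so listing "<" before "<=" would cut the operator match one character short. A self-contained illustration (not code from this commit):

    use regex::Regex;

    fn main() {
        // Shorter branch first: the alternation commits to "<" and stops.
        let unsorted = Regex::new(r"\A(<|<=)").expect("regex");
        assert_eq!(unsorted.find("<=").unwrap().as_str(), "<");

        // Longest-first, as OPERATOR_RE is built: "<=" matches in full.
        let sorted = Regex::new(r"\A(<=|<)").expect("regex");
        assert_eq!(sorted.find("<=").unwrap().as_str(), "<=");
    }
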
7 changes: 4 additions & 3 deletions native/libcst/src/tokenizer/text_position/mod.rs
@@ -5,14 +5,15 @@
 
 mod char_width;
 
-use once_cell::sync::Lazy;
 use regex::Regex;
 use std::fmt;
 
 use crate::tokenizer::debug_utils::EllipsisDebug;
 use char_width::NewlineNormalizedCharWidths;
 
-static CR_OR_LF_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\r\n]").expect("regex"));
+thread_local! {
+    static CR_OR_LF_RE: Regex = Regex::new(r"[\r\n]").expect("regex");
+}
 
 pub trait TextPattern {
     fn match_len(&self, text: &str) -> Option<usize>;
@@ -98,7 +99,7 @@ impl<'t> TextPosition<'t> {
         match match_len {
             Some(match_len) => {
                 assert!(
-                    !CR_OR_LF_RE.is_match(&rest_of_text[..match_len]),
+                    !CR_OR_LF_RE.with(|r| r.is_match(&rest_of_text[..match_len])),
                     "matches pattern must not match a newline",
                 );
                 true