Switch to using thread_local regular expressions to stop mutex contention #996

Merged · 1 commit · Aug 26, 2023
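Background for the change: a `once_cell::sync::Lazy<Regex>` static is a single `Regex` shared by every thread, and the regex crate keeps per-match scratch state in an internal pool that is synchronized across threads, so heavily parallel parsing can contend on that shared pool. A `thread_local!` static gives each thread its own `Regex` (and its own scratch state), so matching never crosses threads. A minimal sketch of the two patterns, not LibCST's exact code (`SHARED_RE`, `LOCAL_RE`, and the `\d+` pattern are illustrative):

```rust
use once_cell::sync::Lazy;
use regex::Regex;

// One Regex shared by all threads; its internal match-scratch pool is
// synchronized, which is where cross-thread contention can show up.
static SHARED_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\d+").expect("regex"));

thread_local! {
    // One Regex per thread: compiled on first use in that thread,
    // never shared, so matching involves no cross-thread coordination.
    static LOCAL_RE: Regex = Regex::new(r"\d+").expect("regex");
}

fn main() {
    // A Lazy static derefs straight to the Regex...
    assert!(SHARED_RE.is_match("abc123"));
    // ...while a thread_local one is borrowed through `with`.
    assert!(LOCAL_RE.with(|re| re.is_match("abc123")));
}
```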
Changes from all commits
258 changes: 193 additions & 65 deletions native/Cargo.lock

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions native/libcst/Cargo.toml
@@ -8,7 +8,7 @@ name = "libcst"
version = "0.1.0"
authors = ["LibCST Developers"]
edition = "2018"
rust-version = "1.53"
rust-version = "1.70"

[lib]
name = "libcst_native"
@@ -34,14 +34,14 @@ pyo3 = { version = ">=0.17", optional = true }
thiserror = "1.0.37"
peg = "0.8.1"
chic = "1.2.2"
itertools = "0.10.5"
once_cell = "1.16.0"
regex = "1.7.0"
regex = "1.9.3"
libcst_derive = { path = "../libcst_derive" }

[dev-dependencies]
criterion = { version = "0.4.0", features = ["html_reports"] }
criterion = { version = "0.5.1", features = ["html_reports"] }
difference = "2.0.0"
rayon = "1.7.0"
itertools = "0.11.0"

[[bench]]
name = "parser_benchmark"
53 changes: 49 additions & 4 deletions native/libcst/benches/parser_benchmark.rs
@@ -9,9 +9,12 @@ use std::{
};

use criterion::{
black_box, criterion_group, criterion_main, measurement::Measurement, BatchSize, Criterion,
black_box, criterion_group, criterion_main, measurement::Measurement, BatchSize, BenchmarkId,
Criterion, Throughput,
};
use itertools::Itertools;
use rayon::prelude::*;

use libcst_native::{
parse_module, parse_tokens_without_whitespace, tokenize, Codegen, Config, Inflate,
};
@@ -21,7 +24,7 @@ const NEWLINE: &str = "\n";
#[cfg(windows)]
const NEWLINE: &str = "\r\n";

fn load_all_fixtures() -> String {
fn load_all_fixtures_vec() -> Vec<String> {
let mut path = PathBuf::from(file!());
path.pop();
path.pop();
@@ -42,7 +45,11 @@ fn load_all_fixtures() -> String {
let path = file.unwrap().path();
std::fs::read_to_string(&path).expect("reading_file")
})
.join(NEWLINE)
.collect()
}

fn load_all_fixtures() -> String {
load_all_fixtures_vec().join(NEWLINE)
}

pub fn inflate_benchmarks<T: Measurement>(c: &mut Criterion<T>) {
@@ -117,9 +124,47 @@ pub fn parse_into_cst_benchmarks<T: Measurement>(c: &mut Criterion<T>) {
group.finish();
}

pub fn parse_into_cst_multithreaded_benchmarks<T: Measurement + std::marker::Sync>(
c: &mut Criterion<T>,
) where
<T as Measurement>::Value: Send,
{
let fixtures = load_all_fixtures_vec();
let mut group = c.benchmark_group("parse_into_cst_parallel");
group.measurement_time(Duration::from_secs(15));
group.warm_up_time(Duration::from_secs(5));

for thread_count in 1..10 {
let expanded_fixtures = (0..thread_count)
.flat_map(|_| fixtures.clone())
.collect_vec();
group.throughput(Throughput::Elements(expanded_fixtures.len() as u64));
group.bench_with_input(
BenchmarkId::from_parameter(thread_count),
&thread_count,
|b, thread_count| {
let thread_pool = rayon::ThreadPoolBuilder::new()
.num_threads(*thread_count)
.build()
.unwrap();
thread_pool.install(|| {
b.iter_with_large_drop(|| {
expanded_fixtures
.par_iter()
.map(|contents| black_box(parse_module(&contents, None)))
.collect::<Vec<_>>()
});
});
},
);
}

group.finish();
}

criterion_group!(
name=benches;
config=Criterion::default();
targets=parser_benchmarks, codegen_benchmarks, inflate_benchmarks, tokenize_benchmarks, parse_into_cst_benchmarks
targets=parser_benchmarks, codegen_benchmarks, inflate_benchmarks, tokenize_benchmarks, parse_into_cst_benchmarks, parse_into_cst_multithreaded_benchmarks
);
criterion_main!(benches);
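The new `parse_into_cst_parallel` group above is what makes the contention measurable: it parses copies of the fixture set on rayon pools of 1 to 9 threads. A stripped-down sketch of that pool-plus-parallel-iterator shape, with placeholder work standing in for `parse_module` (the `parallel_lengths` helper is illustrative, not part of the PR):

```rust
use rayon::prelude::*;

// Build a pool of a fixed size and run a parallel iterator inside it;
// within `install`, rayon's parallel iterators run on this pool.
fn parallel_lengths(inputs: &[String], threads: usize) -> Vec<usize> {
    let pool = rayon::ThreadPoolBuilder::new()
        .num_threads(threads)
        .build()
        .expect("thread pool");
    pool.install(|| inputs.par_iter().map(|s| s.len()).collect())
}

fn main() {
    let data: Vec<String> = (0..8).map(|i| "x".repeat(i)).collect();
    assert_eq!(parallel_lengths(&data, 4), (0..8usize).collect::<Vec<_>>());
}
```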
50 changes: 23 additions & 27 deletions native/libcst/src/parser/numbers.rs
@@ -3,7 +3,6 @@
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree

use once_cell::sync::Lazy;
use regex::Regex;

use crate::nodes::deflated::{Expression, Float, Imaginary, Integer};
@@ -13,51 +12,48 @@ static BIN: &str = r"0[bB](?:_?[01])+";
static OCT: &str = r"0[oO](?:_?[0-7])+";
static DECIMAL: &str = r"(?:0(?:_?0)*|[1-9](?:_?[0-9])*)";

static INTEGER_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(format!("^({}|{}|{}|{})$", HEX, BIN, OCT, DECIMAL).as_str()).expect("regex")
});

static EXPONENT: &str = r"[eE][-+]?[0-9](?:_?[0-9])*";
// Note: these don't exactly match the python implementation (exponent is not included)
static POINT_FLOAT: &str = r"([0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?|\.[0-9](?:_?[0-9])*)";
static EXP_FLOAT: &str = r"[0-9](?:_?[0-9])*";

static FLOAT_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(
format!(
"^({}({})?|{}{})$",
POINT_FLOAT, EXPONENT, EXP_FLOAT, EXPONENT
thread_local! {
static INTEGER_RE: Regex =
Regex::new(format!("^({}|{}|{}|{})$", HEX, BIN, OCT, DECIMAL).as_str()).expect("regex");
static FLOAT_RE: Regex =
Regex::new(
format!(
"^({}({})?|{}{})$",
POINT_FLOAT, EXPONENT, EXP_FLOAT, EXPONENT
)
.as_str(),
)
.as_str(),
)
.expect("regex")
});

static IMAGINARY_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(
format!(
r"^([0-9](?:_?[0-9])*[jJ]|({}({})?|{}{})[jJ])$",
POINT_FLOAT, EXPONENT, EXP_FLOAT, EXPONENT
.expect("regex");
static IMAGINARY_RE: Regex =
Regex::new(
format!(
r"^([0-9](?:_?[0-9])*[jJ]|({}({})?|{}{})[jJ])$",
POINT_FLOAT, EXPONENT, EXP_FLOAT, EXPONENT
)
.as_str(),
)
.as_str(),
)
.expect("regex")
});
.expect("regex");
}

pub(crate) fn parse_number(raw: &str) -> Expression {
if INTEGER_RE.is_match(raw) {
if INTEGER_RE.with(|r| r.is_match(raw)) {
Expression::Integer(Box::new(Integer {
value: raw,
lpar: Default::default(),
rpar: Default::default(),
}))
} else if FLOAT_RE.is_match(raw) {
} else if FLOAT_RE.with(|r| r.is_match(raw)) {
Expression::Float(Box::new(Float {
value: raw,
lpar: Default::default(),
rpar: Default::default(),
}))
} else if IMAGINARY_RE.is_match(raw) {
} else if IMAGINARY_RE.with(|r| r.is_match(raw)) {
Expression::Imaginary(Box::new(Imaginary {
value: raw,
lpar: Default::default(),
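The call-site change in `parse_number` is the visible cost of the switch: a thread-local static cannot be dereferenced directly, so every use goes through `with`, which lends the closure a reference to this thread's instance. A self-contained sketch of that access pattern (the regex is simplified and `is_integer` is illustrative):

```rust
use regex::Regex;

thread_local! {
    // Initialized lazily, once per thread, on the first `with` access.
    static INTEGER_RE: Regex =
        Regex::new(r"^(0[xX](?:_?[0-9a-fA-F])+|[0-9](?:_?[0-9])*)$").expect("regex");
}

fn is_integer(raw: &str) -> bool {
    // `with` borrows this thread's Regex for the duration of the closure.
    INTEGER_RE.with(|r| r.is_match(raw))
}

fn main() {
    assert!(is_integer("1_000"));
    assert!(is_integer("0xFF"));
    assert!(!is_integer("1.5"));
}
```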
63 changes: 32 additions & 31 deletions native/libcst/src/tokenizer/core/mod.rs
@@ -58,7 +58,6 @@
/// [RustPython's parser]: https://crates.io/crates/rustpython-parser
mod string_types;

use once_cell::sync::Lazy;
use regex::Regex;
use std::cell::RefCell;
use std::cmp::Ordering;
@@ -83,25 +82,27 @@ const MAX_INDENT: usize = 100;
// https://github.com/rust-lang/rust/issues/71763
const MAX_CHAR: char = '\u{10ffff}';

static SPACE_TAB_FORMFEED_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A[ \f\t]+").expect("regex"));
static ANY_NON_NEWLINE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A[^\r\n]+").expect("regex"));
static STRING_PREFIX_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"\A(?i)(u|[bf]r|r[bf]|r|b|f)").expect("regex"));
static POTENTIAL_IDENTIFIER_TAIL_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"\A([a-zA-Z0-9_]|[^\x00-\x7f])+").expect("regex"));
static DECIMAL_DOT_DIGIT_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A\.[0-9]").expect("regex"));
static DECIMAL_TAIL_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"\A[0-9](_?[0-9])*").expect("regex"));
static HEXADECIMAL_TAIL_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"\A(_?[0-9a-fA-F])+").expect("regex"));
static OCTAL_TAIL_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A(_?[0-7])+").expect("regex"));
static BINARY_TAIL_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A(_?[01])+").expect("regex"));

/// Used to verify identifiers when there's a non-ascii character in them.
// This changes across unicode revisions. We'd need to ship our own unicode tables to 100% match a
// given Python version's behavior.
static UNICODE_IDENTIFIER_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"\A[\p{XID_Start}_]\p{XID_Continue}*\z").expect("regex"));
thread_local! {
static SPACE_TAB_FORMFEED_RE: Regex = Regex::new(r"\A[ \f\t]+").expect("regex");
static ANY_NON_NEWLINE_RE: Regex = Regex::new(r"\A[^\r\n]+").expect("regex");
static STRING_PREFIX_RE: Regex =
Regex::new(r"\A(?i)(u|[bf]r|r[bf]|r|b|f)").expect("regex");
static POTENTIAL_IDENTIFIER_TAIL_RE: Regex =
Regex::new(r"\A([a-zA-Z0-9_]|[^\x00-\x7f])+").expect("regex");
static DECIMAL_DOT_DIGIT_RE: Regex = Regex::new(r"\A\.[0-9]").expect("regex");
static DECIMAL_TAIL_RE: Regex =
Regex::new(r"\A[0-9](_?[0-9])*").expect("regex");
static HEXADECIMAL_TAIL_RE: Regex =
Regex::new(r"\A(_?[0-9a-fA-F])+").expect("regex");
static OCTAL_TAIL_RE: Regex = Regex::new(r"\A(_?[0-7])+").expect("regex");
static BINARY_TAIL_RE: Regex = Regex::new(r"\A(_?[01])+").expect("regex");

/// Used to verify identifiers when there's a non-ascii character in them.
// This changes across unicode revisions. We'd need to ship our own unicode tables to 100% match a
// given Python version's behavior.
static UNICODE_IDENTIFIER_RE: Regex =
Regex::new(r"\A[\p{XID_Start}_]\p{XID_Continue}*\z").expect("regex");
}

#[derive(Debug, Eq, PartialEq, Copy, Clone)]
pub enum TokType {
@@ -316,11 +317,11 @@ impl<'t> TokState<'t> {

'again: loop {
// Skip spaces
self.text_pos.consume(&*SPACE_TAB_FORMFEED_RE);
SPACE_TAB_FORMFEED_RE.with(|v| self.text_pos.consume(v));

// Skip comment, unless it's a type comment
if self.text_pos.peek() == Some('#') {
self.text_pos.consume(&*ANY_NON_NEWLINE_RE);
ANY_NON_NEWLINE_RE.with(|v| self.text_pos.consume(v));
// type_comment is not supported
}

@@ -384,7 +385,7 @@ impl<'t> TokState<'t> {
}

// Number starting with period
Some('.') if self.text_pos.matches(&*DECIMAL_DOT_DIGIT_RE) => {
Some('.') if DECIMAL_DOT_DIGIT_RE.with(|r| self.text_pos.matches(r)) => {
self.consume_number(NumberState::Fraction)
}

@@ -472,7 +473,7 @@ impl<'t> TokState<'t> {
}

// Operator
Some(_) if self.text_pos.consume(&*OPERATOR_RE) => Ok(TokType::Op),
Some(_) if OPERATOR_RE.with(|r| self.text_pos.consume(r)) => Ok(TokType::Op),

// Bad character
// If nothing works, fall back to this error. CPython returns an OP in this case,
@@ -623,7 +624,7 @@ impl<'t> TokState<'t> {

fn consume_identifier_or_prefixed_string(&mut self) -> Result<TokType, TokError<'t>> {
// Process the various legal combinations of b"", r"", u"", and f"".
if self.text_pos.consume(&*STRING_PREFIX_RE) {
if STRING_PREFIX_RE.with(|r| self.text_pos.consume(r)) {
if let Some('"') | Some('\'') = self.text_pos.peek() {
// We found a string, not an identifier. Bail!
if self.split_fstring
@@ -645,7 +646,7 @@ impl<'t> TokState<'t> {
Some('a'..='z') | Some('A'..='Z') | Some('_') | Some('\u{80}'..=MAX_CHAR)
));
}
self.text_pos.consume(&*POTENTIAL_IDENTIFIER_TAIL_RE);
POTENTIAL_IDENTIFIER_TAIL_RE.with(|r| self.text_pos.consume(r));
let identifier_str = self.text_pos.slice_from_start_pos(&self.start_pos);
if !verify_identifier(identifier_str) {
// TODO: async/await
@@ -691,7 +692,7 @@ impl<'t> TokState<'t> {
match self.text_pos.peek() {
Some('x') | Some('X') => {
self.text_pos.next();
if !self.text_pos.consume(&*HEXADECIMAL_TAIL_RE)
if !HEXADECIMAL_TAIL_RE.with(|r| self.text_pos.consume(r))
|| self.text_pos.peek() == Some('_')
{
Err(TokError::BadHexadecimal)
Expand All @@ -701,7 +702,7 @@ impl<'t> TokState<'t> {
}
Some('o') | Some('O') => {
self.text_pos.next();
if !self.text_pos.consume(&*OCTAL_TAIL_RE)
if !OCTAL_TAIL_RE.with(|r| self.text_pos.consume(r))
|| self.text_pos.peek() == Some('_')
{
return Err(TokError::BadOctal);
Expand All @@ -715,7 +716,7 @@ impl<'t> TokState<'t> {
}
Some('b') | Some('B') => {
self.text_pos.next();
if !self.text_pos.consume(&*BINARY_TAIL_RE)
if !BINARY_TAIL_RE.with(|r| self.text_pos.consume(r))
|| self.text_pos.peek() == Some('_')
{
return Err(TokError::BadBinary);
@@ -819,7 +820,7 @@ impl<'t> TokState<'t> {

/// Processes a decimal tail. This is the bit after the dot or after an E in a float.
fn consume_decimal_tail(&mut self) -> Result<(), TokError<'t>> {
let result = self.text_pos.consume(&*DECIMAL_TAIL_RE);
let result = DECIMAL_TAIL_RE.with(|r| self.text_pos.consume(r));
// Assumption: If we've been called, the first character is an integer, so we must have a
// regex match
debug_assert!(result, "try_decimal_tail was called on a non-digit char");
@@ -1058,7 +1059,7 @@ fn verify_identifier(name: &str) -> bool {
// TODO: If `name` is non-ascii, must first normalize name to NFKC.
// Common case: If the entire string is ascii, we can avoid the more expensive regex check,
// since the tokenizer already validates ascii characters before calling us.
name.is_ascii() || UNICODE_IDENTIFIER_RE.is_match(name)
name.is_ascii() || UNICODE_IDENTIFIER_RE.with(|r| r.is_match(name))
}

#[derive(Clone)]
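One trade-off worth noting for these tokenizer regexes: with `Lazy`, each pattern is compiled once per process; with `thread_local!`, it is compiled once per thread that touches it, and dropped when that thread exits. For a long-lived pool of parser threads that one-time cost is negligible next to the synchronization it removes. A small demonstration of the per-thread initialization (illustrative only):

```rust
use regex::Regex;
use std::thread;

thread_local! {
    // The initializer runs once per thread, on that thread's first access;
    // later accesses on the same thread reuse the compiled Regex.
    static ID_RE: Regex = {
        println!("compiling on {:?}", thread::current().id());
        Regex::new(r"\A[A-Za-z_][A-Za-z0-9_]*").expect("regex")
    };
}

fn main() {
    let handles: Vec<_> = (0..2)
        .map(|_| {
            thread::spawn(|| {
                // First `with` on this thread triggers the initializer...
                ID_RE.with(|re| assert!(re.is_match("token")));
                // ...and the second one reuses the same instance.
                ID_RE.with(|re| assert!(re.is_match("token2")));
            })
        })
        .collect();
    for h in handles {
        h.join().unwrap();
    }
}
```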
7 changes: 4 additions & 3 deletions native/libcst/src/tokenizer/operators.rs
@@ -8,7 +8,7 @@
// code or that we retain the original work's copyright information.
// https://docs.python.org/3/license.html#zero-clause-bsd-license-for-code-in-the-python-release-documentation

use once_cell::sync::Lazy;
use regex::Regex;

/// A list of strings that make up all the possible operators in a specific version of Python.
@@ -69,7 +68,8 @@ pub const OPERATORS: &[&str] = &[
"<>",
];

pub static OPERATOR_RE: Lazy<Regex> = Lazy::new(|| {
thread_local! {
pub static OPERATOR_RE: Regex = {
// sort operators so that we try to match the longest ones first
let mut sorted_operators: Box<[&str]> = OPERATORS.into();
sorted_operators.sort_unstable_by_key(|op| usize::MAX - op.len());
@@ -82,4 +82,5 @@ pub static OPERATOR_RE: Lazy<Regex> = Lazy::new(|| {
.join("|")
))
.expect("regex")
});
};
}
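As the `OPERATOR_RE` change shows, a `thread_local!` initializer can be an arbitrary block, so derived tables like the longest-first operator alternation move over unchanged. A reduced sketch of that build-then-compile shape (the word list is illustrative; real operator strings would also need escaping for regex metacharacters, e.g. with `regex::escape`):

```rust
use regex::Regex;

thread_local! {
    // The initializer block runs once per thread on first access.
    static WORD_RE: Regex = {
        let mut words = vec!["fo", "foobar", "foo"];
        // Longest first, so the alternation prefers the longest match.
        words.sort_unstable_by_key(|w| usize::MAX - w.len());
        Regex::new(&format!(r"\A({})", words.join("|"))).expect("regex")
    };
}

fn main() {
    let longest = WORD_RE.with(|re| re.find("foobar!").map(|m| m.as_str()));
    assert_eq!(longest, Some("foobar"));
}
```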
7 changes: 4 additions & 3 deletions native/libcst/src/tokenizer/text_position/mod.rs
@@ -5,14 +5,15 @@

mod char_width;

use once_cell::sync::Lazy;
use regex::Regex;
use std::fmt;

use crate::tokenizer::debug_utils::EllipsisDebug;
use char_width::NewlineNormalizedCharWidths;

static CR_OR_LF_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\r\n]").expect("regex"));
thread_local! {
static CR_OR_LF_RE: Regex = Regex::new(r"[\r\n]").expect("regex");
}

pub trait TextPattern {
fn match_len(&self, text: &str) -> Option<usize>;
@@ -98,7 +99,7 @@ impl<'t> TextPosition<'t> {
match match_len {
Some(match_len) => {
assert!(
!CR_OR_LF_RE.is_match(&rest_of_text[..match_len]),
!CR_OR_LF_RE.with(|r| r.is_match(&rest_of_text[..match_len])),
"matches pattern must not match a newline",
);
true