From eb9cc7537ce9cc8f113c833a708cd79863fc5756 Mon Sep 17 00:00:00 2001 From: Dhruv Manilawala Date: Tue, 29 Aug 2023 10:01:20 +0530 Subject: [PATCH] Use `bitflags` for f-string context, move it under `lexer/` --- Cargo.lock | 1 + crates/ruff_python_parser/Cargo.toml | 1 + crates/ruff_python_parser/src/lexer.rs | 199 ++++-------------- .../ruff_python_parser/src/lexer/fstring.rs | 137 ++++++++++++ 4 files changed, 175 insertions(+), 163 deletions(-) create mode 100644 crates/ruff_python_parser/src/lexer/fstring.rs diff --git a/Cargo.lock b/Cargo.lock index b44f1af78a808e..9332929401d1b5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2422,6 +2422,7 @@ name = "ruff_python_parser" version = "0.0.0" dependencies = [ "anyhow", + "bitflags 2.3.3", "insta", "is-macro", "itertools", diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml index ed5920b44b7fd2..86f4c3ffce8c6f 100644 --- a/crates/ruff_python_parser/Cargo.toml +++ b/crates/ruff_python_parser/Cargo.toml @@ -18,6 +18,7 @@ ruff_python_ast = { path = "../ruff_python_ast" } ruff_text_size = { path = "../ruff_text_size" } anyhow = { workspace = true } +bitflags = { workspace = true } is-macro = { workspace = true } itertools = { workspace = true } lalrpop-util = { version = "0.20.0", default-features = false } diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index 59dec550f13000..dc3f44a4be24d2 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -40,6 +40,7 @@ use unic_emoji_char::is_emoji_presentation; use unic_ucd_ident::{is_xid_continue, is_xid_start}; use crate::lexer::cursor::{Cursor, EOF_CHAR}; +use crate::lexer::fstring::{FStringContext, FStringContextFlags}; use crate::lexer::indentation::{Indentation, Indentations}; use crate::{ soft_keywords::SoftKeywordTransformer, @@ -49,6 +50,7 @@ use crate::{ }; mod cursor; +mod fstring; mod indentation; /// A lexer for Python source code. 
@@ -547,20 +549,20 @@ impl<'source> Lexer<'source> { #[cfg(debug_assertions)] debug_assert_eq!(self.cursor.previous(), quote); - // If the next two characters are also the quote character, then we have a triple-quoted - // string; consume those two characters and ensure that we require a triple-quote to close - let quote_size = if self.cursor.first() == quote && self.cursor.second() == quote { + let mut flags = FStringContextFlags::empty(); + if quote == '"' { + flags |= FStringContextFlags::DOUBLE; + } + if is_raw_string { + flags |= FStringContextFlags::RAW; + } + if self.cursor.first() == quote && self.cursor.second() == quote { self.cursor.bump(); self.cursor.bump(); - StringQuoteSize::Triple - } else { - StringQuoteSize::Single + flags |= FStringContextFlags::TRIPLE; }; - // SAFETY: Safe because `quote` is either `'` or `"` - let quote_char = StringQuoteChar::try_from(quote).unwrap(); - self.fstring_stack - .push(FStringContext::new(quote_char, quote_size, is_raw_string)); + self.fstring_stack.push(FStringContext::new(flags)); Tok::FStringStart } @@ -570,18 +572,13 @@ impl<'source> Lexer<'source> { let context = self.fstring_stack.last().unwrap(); // Check if we're at the end of the f-string. - match context.quote_size { - StringQuoteSize::Single => { - if self.cursor.eat_char(context.quote_char.as_char()) { - return Ok(Some(Tok::FStringEnd)); - } - } - StringQuoteSize::Triple => { - let quote_char = context.quote_char.as_char(); - if self.cursor.eat_char3(quote_char, quote_char, quote_char) { - return Ok(Some(Tok::FStringEnd)); - } + if context.is_triple_quoted() { + let quote_char = context.quote_char(); + if self.cursor.eat_char3(quote_char, quote_char, quote_char) { + return Ok(Some(Tok::FStringEnd)); } + } else if self.cursor.eat_char(context.quote_char()) { + return Ok(Some(Tok::FStringEnd)); } // The normalized string if the token value is not yet normalized. 
@@ -597,7 +594,7 @@ impl<'source> Lexer<'source> { loop { match self.cursor.first() { EOF_CHAR => { - let error = if context.allow_multiline() { + let error = if context.is_triple_quoted() { FStringErrorType::UnterminatedTripleQuotedString } else { FStringErrorType::UnterminatedString @@ -610,7 +607,7 @@ impl<'source> Lexer<'source> { location: self.offset(), }); } - '\n' if !context.allow_multiline() => { + '\n' if !context.is_triple_quoted() => { // This is to avoid infinite loop where the lexer keeps returning // the error token. self.fstring_stack.pop(); @@ -624,7 +621,7 @@ impl<'source> Lexer<'source> { if matches!(self.cursor.first(), '{' | '}') { // Don't consume `{` or `}` as we want them to be consumed as tokens. break; - } else if !context.is_raw_string { + } else if !context.is_raw_string() { if self.cursor.eat_char2('N', '{') { in_named_unicode = true; continue; @@ -633,20 +630,15 @@ impl<'source> Lexer<'source> { // Consume the escaped character. self.cursor.bump(); } - quote @ ('\'' | '"') if quote == context.quote_char.as_char() => { - match context.quote_size { - StringQuoteSize::Single => break, - StringQuoteSize::Triple => { - let triple_quote = match context.quote_char { - StringQuoteChar::Single => "'''", - StringQuoteChar::Double => r#"""""#, - }; - if self.cursor.rest().starts_with(triple_quote) { - break; - } + quote @ ('\'' | '"') if quote == context.quote_char() => { + if let Some(triple_quotes) = context.triple_quotes() { + if self.cursor.rest().starts_with(triple_quotes) { + break; } + self.cursor.bump(); + } else { + break; } - self.cursor.bump(); } '{' => { if self.cursor.second() == '{' { @@ -692,7 +684,7 @@ impl<'source> Lexer<'source> { }; Ok(Some(Tok::FStringMiddle { value, - is_raw: context.is_raw_string, + is_raw: context.is_raw_string(), })) } @@ -716,22 +708,13 @@ impl<'source> Lexer<'source> { // When we are in an f-string, check whether does the initial quote // matches with f-strings quotes and if it is, then this must be a 
// missing '}' token so raise the proper error. - if fstring_context.quote_char.as_char() == quote { - match fstring_context.quote_size { - StringQuoteSize::Single if !triple_quoted => { - return Err(LexicalError { - error: LexicalErrorType::FStringError(FStringErrorType::UnclosedLbrace), - location: self.offset() - TextSize::new(1), - }); - } - StringQuoteSize::Triple if triple_quoted => { - return Err(LexicalError { - error: LexicalErrorType::FStringError(FStringErrorType::UnclosedLbrace), - location: self.offset() - TextSize::new(3), - }); - } - _ => {} - } + if fstring_context.quote_char() == quote + && fstring_context.is_triple_quoted() == triple_quoted + { + return Err(LexicalError { + error: LexicalErrorType::FStringError(FStringErrorType::UnclosedLbrace), + location: self.offset() - fstring_context.quote_size(), + }); } } @@ -1387,116 +1370,6 @@ impl std::fmt::Display for LexicalErrorType { } } -#[derive(Copy, Clone, Debug)] -#[cfg(feature = "pep-701")] -enum StringQuoteChar { - Single, - Double, -} - -#[cfg(feature = "pep-701")] -impl TryFrom for StringQuoteChar { - type Error = String; - - fn try_from(c: char) -> Result { - match c { - '\'' => Ok(Self::Single), - '"' => Ok(Self::Double), - _ => Err(format!("Invalid string quote character: {c}")), - } - } -} - -#[cfg(feature = "pep-701")] -impl StringQuoteChar { - const fn as_char(self) -> char { - match self { - Self::Single => '\'', - Self::Double => '"', - } - } -} - -#[derive(Copy, Clone, Debug)] -#[cfg(feature = "pep-701")] -enum StringQuoteSize { - Single, - Triple, -} - -#[derive(Debug)] -#[cfg(feature = "pep-701")] -struct FStringContext { - quote_char: StringQuoteChar, - quote_size: StringQuoteSize, - parentheses_count: u32, - format_spec_count: u32, - is_raw_string: bool, -} - -#[cfg(feature = "pep-701")] -impl FStringContext { - fn new(quote_char: StringQuoteChar, quote_size: StringQuoteSize, is_raw_string: bool) -> Self { - Self { - quote_char, - quote_size, - parentheses_count: 0, - 
format_spec_count: 0, - is_raw_string, - } - } - - /// Returns `true` if the current f-string allows multiline i.e., a triple-quoted f-string. - const fn allow_multiline(&self) -> bool { - matches!(self.quote_size, StringQuoteSize::Triple) - } - - /// Returns `true` if the current f-string has open parentheses. - fn has_open_parentheses(&mut self) -> bool { - self.parentheses_count > 0 - } - - /// Increments the number of parentheses for the current f-string. - fn increment_opening_parentheses(&mut self) { - self.parentheses_count += 1; - } - - /// Decrements the number of parentheses for the current f-string. If we're - /// in a format spec, also decrements the number of format specs. - fn decrement_closing_parentheses(&mut self) { - if self.is_in_format_spec() { - self.format_spec_count = self.format_spec_count.saturating_sub(1); - } - self.parentheses_count = self.parentheses_count.saturating_sub(1); - } - - /// Returns `true` if the lexer is in a f-string expression i.e., between two curly braces. - fn is_in_expression(&self) -> bool { - self.parentheses_count > self.format_spec_count - } - - /// Returns `true` if the lexer is in a f-string format spec i.e., after a colon. - fn is_in_format_spec(&self) -> bool { - self.format_spec_count > 0 && !self.is_in_expression() - } - - /// Returns `true` if the colon (`:`) for the current f-string is in a valid - /// position i.e., at the same level of nesting as the opening parentheses token. - /// Increments the number of format specs if it is. - fn try_start_format_spec(&mut self) -> bool { - if self - .parentheses_count - .saturating_sub(self.format_spec_count) - == 1 - { - self.format_spec_count += 1; - true - } else { - false - } - } -} - #[derive(Copy, Clone, Debug)] enum State { /// Lexer is right at the beginning of the file or after a `Newline` token. 
diff --git a/crates/ruff_python_parser/src/lexer/fstring.rs b/crates/ruff_python_parser/src/lexer/fstring.rs
new file mode 100644
index 00000000000000..dd7d988df676a4
--- /dev/null
+++ b/crates/ruff_python_parser/src/lexer/fstring.rs
@@ -0,0 +1,144 @@
+//! State that the lexer tracks while it is inside an f-string.
+
+use bitflags::bitflags;
+
+use ruff_text_size::TextSize;
+
+bitflags! {
+    /// Flags describing the quoting style and rawness of an f-string.
+    #[derive(Debug)]
+    #[cfg(feature = "pep-701")]
+    pub(crate) struct FStringContextFlags: u32 {
+        /// The current f-string is a triple-quoted f-string i.e., the number of
+        /// opening and closing quotes is 3. If this flag is not set, the number
+        /// of opening and closing quotes is 1.
+        const TRIPLE = 1 << 0;
+
+        /// The current f-string is a double-quoted f-string. If this flag is not
+        /// set, the current f-string is a single-quoted f-string.
+        const DOUBLE = 1 << 1;
+
+        /// The current f-string is a raw f-string. If this flag is not set, the
+        /// current f-string is a non-raw f-string.
+        const RAW = 1 << 2;
+    }
+}
+
+/// Tracks the state of a single f-string while it is being lexed: its quote
+/// style and how many parentheses and format specs are currently open.
+#[derive(Debug)]
+#[cfg(feature = "pep-701")]
+pub(crate) struct FStringContext {
+    /// The quote style (single/double, single/triple) and rawness of the f-string.
+    flags: FStringContextFlags,
+    /// The number of open parentheses for the current f-string. This includes all
+    /// three types of parentheses: round (`(`), square (`[`), and curly (`{`).
+    open_parentheses_count: u32,
+    /// The nesting depth of format specs for the current f-string. Format specs
+    /// can be nested: for `{a:{b:{c}}}` the depth reaches 2, one level for the
+    /// spec after `a:` and one for the spec after `b:`.
+    format_spec_depth: u32,
+}
+
+#[cfg(feature = "pep-701")]
+impl FStringContext {
+    /// Creates a context for an f-string described by `flags`, with no open
+    /// parentheses and outside of any format spec.
+    pub(crate) fn new(flags: FStringContextFlags) -> Self {
+        Self {
+            flags,
+            open_parentheses_count: 0,
+            format_spec_depth: 0,
+        }
+    }
+
+    /// Returns the quote character for the current f-string.
+    pub(crate) fn quote_char(&self) -> char {
+        if self.flags.contains(FStringContextFlags::DOUBLE) {
+            '"'
+        } else {
+            '\''
+        }
+    }
+
+    /// Returns the number of quotes for the current f-string.
+    pub(crate) fn quote_size(&self) -> TextSize {
+        if self.is_triple_quoted() {
+            TextSize::from(3)
+        } else {
+            TextSize::from(1)
+        }
+    }
+
+    /// Returns the triple quotes for the current f-string if it is a triple-quoted
+    /// f-string, `None` otherwise.
+    pub(crate) fn triple_quotes(&self) -> Option<&'static str> {
+        if self.is_triple_quoted() {
+            if self.flags.contains(FStringContextFlags::DOUBLE) {
+                Some(r#"""""#)
+            } else {
+                Some("'''")
+            }
+        } else {
+            None
+        }
+    }
+
+    /// Returns `true` if the current f-string is a raw f-string.
+    pub(crate) fn is_raw_string(&self) -> bool {
+        self.flags.contains(FStringContextFlags::RAW)
+    }
+
+    /// Returns `true` if the current f-string is a triple-quoted f-string.
+    pub(crate) fn is_triple_quoted(&self) -> bool {
+        self.flags.contains(FStringContextFlags::TRIPLE)
+    }
+
+    /// Returns `true` if the current f-string has open parentheses.
+    pub(crate) fn has_open_parentheses(&self) -> bool {
+        self.open_parentheses_count > 0
+    }
+
+    /// Increments the number of parentheses for the current f-string.
+    pub(crate) fn increment_opening_parentheses(&mut self) {
+        self.open_parentheses_count += 1;
+    }
+
+    /// Decrements the number of parentheses for the current f-string. If the
+    /// lexer is in a format spec, also decrements the number of format specs.
+    pub(crate) fn decrement_closing_parentheses(&mut self) {
+        if self.is_in_format_spec() {
+            self.format_spec_depth = self.format_spec_depth.saturating_sub(1);
+        }
+        self.open_parentheses_count = self.open_parentheses_count.saturating_sub(1);
+    }
+
+    /// Returns `true` if the lexer is in a f-string expression i.e., between
+    /// two curly braces.
+    pub(crate) fn is_in_expression(&self) -> bool {
+        self.open_parentheses_count > self.format_spec_depth
+    }
+
+    /// Returns `true` if the lexer is in a f-string format spec i.e., after a colon.
+    pub(crate) fn is_in_format_spec(&self) -> bool {
+        self.format_spec_depth > 0 && !self.is_in_expression()
+    }
+
+    /// Returns `true` if the context is in a valid position to start format spec
+    /// i.e., at the same level of nesting as the opening parentheses token.
+    /// Increments the number of format specs if it is.
+    ///
+    /// This assumes that the current character for the lexer is a colon (`:`).
+    pub(crate) fn try_start_format_spec(&mut self) -> bool {
+        if self
+            .open_parentheses_count
+            .saturating_sub(self.format_spec_depth)
+            == 1
+        {
+            self.format_spec_depth += 1;
+            true
+        } else {
+            false
+        }
+    }
+}