Add Tokens newtype wrapper, TokenKind iterator #11361

Merged · 2 commits · May 14, 2024
5 changes: 2 additions & 3 deletions crates/ruff_linter/src/importer/insertion.rs
@@ -321,7 +321,6 @@ mod tests {

use ruff_python_ast::PySourceType;
use ruff_python_codegen::Stylist;
use ruff_python_parser::lexer::LexResult;
use ruff_python_parser::{parse_suite, Mode};
use ruff_source_file::{LineEnding, Locator};
use ruff_text_size::TextSize;
@@ -332,7 +331,7 @@ mod tests {
fn start_of_file() -> Result<()> {
fn insert(contents: &str) -> Result<Insertion> {
let program = parse_suite(contents)?;
let tokens: Vec<LexResult> = ruff_python_parser::tokenize(contents, Mode::Module);
let tokens = ruff_python_parser::tokenize(contents, Mode::Module);
let locator = Locator::new(contents);
let stylist = Stylist::from_tokens(&tokens, &locator);
Ok(Insertion::start_of_file(&program, &locator, &stylist))
@@ -443,7 +442,7 @@ x = 1
#[test]
fn start_of_block() {
fn insert(contents: &str, offset: TextSize) -> Insertion {
let tokens: Vec<LexResult> = ruff_python_parser::tokenize(contents, Mode::Module);
let tokens = ruff_python_parser::tokenize(contents, Mode::Module);
let locator = Locator::new(contents);
let stylist = Stylist::from_tokens(&tokens, &locator);
Insertion::start_of_block(offset, &locator, &stylist, PySourceType::default())
21 changes: 16 additions & 5 deletions crates/ruff_linter/src/linter.rs
@@ -14,7 +14,7 @@ use ruff_python_ast::{PySourceType, Suite};
use ruff_python_codegen::Stylist;
use ruff_python_index::Indexer;
use ruff_python_parser::lexer::LexResult;
use ruff_python_parser::{AsMode, ParseError};
use ruff_python_parser::{AsMode, ParseError, TokenKindIter, Tokens};
use ruff_source_file::{Locator, SourceFileBuilder};
use ruff_text_size::Ranged;

@@ -353,7 +353,7 @@ pub fn add_noqa_to_path(
let contents = source_kind.source_code();

// Tokenize once.
let tokens: Vec<LexResult> = ruff_python_parser::tokenize(contents, source_type.as_mode());
let tokens = ruff_python_parser::tokenize(contents, source_type.as_mode());

// Map row and column locations to byte slices (lazily).
let locator = Locator::new(contents);
@@ -518,8 +518,7 @@ pub fn lint_fix<'a>(
// Continuously fix until the source code stabilizes.
loop {
// Tokenize once.
let tokens: Vec<LexResult> =
ruff_python_parser::tokenize(transformed.source_code(), source_type.as_mode());
let tokens = ruff_python_parser::tokenize(transformed.source_code(), source_type.as_mode());

// Map row and column locations to byte slices (lazily).
let locator = Locator::new(transformed.source_code());
@@ -715,14 +714,26 @@ impl<'a> ParseSource<'a> {
#[derive(Debug, Clone)]
pub enum TokenSource<'a> {
/// Use the precomputed tokens to generate the AST.
Tokens(Vec<LexResult>),
Tokens(Tokens),
/// Use the precomputed tokens and AST.
Precomputed {
tokens: &'a [LexResult],
ast: &'a Suite,
},
}

impl TokenSource<'_> {
/// Returns an iterator over the [`TokenKind`] and the corresponding range.
///
/// [`TokenKind`]: ruff_python_parser::TokenKind
pub fn kinds(&self) -> TokenKindIter {
match self {
TokenSource::Tokens(tokens) => tokens.kinds(),
TokenSource::Precomputed { tokens, .. } => TokenKindIter::new(tokens),
}
}
}

impl Deref for TokenSource<'_> {
type Target = [LexResult];

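
As a rough usage sketch (not part of this diff), a consumer inside `ruff_linter` could now iterate token kinds without caring which `TokenSource` variant it holds; `count_comments` below is a hypothetical helper written against the API shown above.

```rust
use ruff_python_parser::TokenKind;

// Hypothetical helper inside `ruff_linter`: counts comment tokens, whether the
// `TokenSource` owns its tokens or borrows precomputed ones.
fn count_comments(source: &TokenSource<'_>) -> usize {
    source
        .kinds()
        .filter(|(kind, _range)| matches!(kind, TokenKind::Comment))
        .count()
}
```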
3 changes: 1 addition & 2 deletions crates/ruff_linter/src/rules/pyflakes/mod.rs
@@ -11,7 +11,6 @@ mod tests {

use anyhow::Result;
use regex::Regex;
use ruff_python_parser::lexer::LexResult;

use test_case::test_case;

@@ -591,7 +590,7 @@ mod tests {
let source_type = PySourceType::default();
let source_kind = SourceKind::Python(contents.to_string());
let settings = LinterSettings::for_rules(Linter::Pyflakes.rules());
let tokens: Vec<LexResult> = ruff_python_parser::tokenize(&contents, source_type.as_mode());
let tokens = ruff_python_parser::tokenize(&contents, source_type.as_mode());
let locator = Locator::new(&contents);
let stylist = Stylist::from_tokens(&tokens, &locator);
let indexer = Indexer::from_tokens(&tokens, &locator);
6 changes: 2 additions & 4 deletions crates/ruff_linter/src/test.rs
@@ -16,7 +16,6 @@ use ruff_notebook::NotebookError;
use ruff_python_ast::PySourceType;
use ruff_python_codegen::Stylist;
use ruff_python_index::Indexer;
use ruff_python_parser::lexer::LexResult;
use ruff_python_parser::AsMode;
use ruff_python_trivia::textwrap::dedent;
use ruff_source_file::{Locator, SourceFileBuilder};
@@ -111,8 +110,7 @@ pub(crate) fn test_contents<'a>(
settings: &LinterSettings,
) -> (Vec<Message>, Cow<'a, SourceKind>) {
let source_type = PySourceType::from(path);
let tokens: Vec<LexResult> =
ruff_python_parser::tokenize(source_kind.source_code(), source_type.as_mode());
let tokens = ruff_python_parser::tokenize(source_kind.source_code(), source_type.as_mode());
let locator = Locator::new(source_kind.source_code());
let stylist = Stylist::from_tokens(&tokens, &locator);
let indexer = Indexer::from_tokens(&tokens, &locator);
@@ -177,7 +175,7 @@

transformed = Cow::Owned(transformed.updated(fixed_contents, &source_map));

let tokens: Vec<LexResult> =
let tokens =
ruff_python_parser::tokenize(transformed.source_code(), source_type.as_mode());
let locator = Locator::new(transformed.source_code());
let stylist = Stylist::from_tokens(&tokens, &locator);
118 changes: 113 additions & 5 deletions crates/ruff_python_parser/src/lib.rs
@@ -110,14 +110,17 @@
//! [parsing]: https://en.wikipedia.org/wiki/Parsing
//! [lexer]: crate::lexer

use std::iter::FusedIterator;
use std::ops::Deref;

use crate::lexer::{lex, lex_starts_at, LexResult};

pub use crate::error::{FStringErrorType, ParseError, ParseErrorType};
pub use crate::parser::Program;
pub use crate::token::{Tok, TokenKind};

use ruff_python_ast::{Expr, Mod, ModModule, PySourceType, Suite};
use ruff_text_size::TextSize;
use ruff_text_size::{Ranged, TextRange, TextSize};

mod error;
pub mod lexer;
@@ -339,8 +342,113 @@ pub fn parse_tokens(tokens: Vec<LexResult>, source: &str, mode: Mode) -> Result<
}
}

/// Tokens represents a vector of [`LexResult`].
///
/// This should only include tokens up to and including the first error. This struct is created
/// by the [`tokenize`] function.
#[derive(Debug, Clone)]
pub struct Tokens(Vec<LexResult>);

impl Tokens {
/// Returns an iterator over the [`TokenKind`] and the range corresponding to the tokens.
pub fn kinds(&self) -> TokenKindIter {
TokenKindIter::new(&self.0)
}

/// Returns an iterator over the [`TokenKind`] and its range for all the tokens that are
/// within the given `range`.
///
/// The start and end position of the given range should correspond to the start position of
/// the first token and the end position of the last token in the returned iterator.
///
/// For example, if the struct contains the following tokens:
/// ```txt
/// (Def, 0..3)
/// (Name, 4..7)
/// (Lpar, 7..8)
/// (Rpar, 8..9)
/// (Colon, 9..10)
/// (Ellipsis, 11..14)
/// (Newline, 14..14)
/// ```
///
/// Then, the range `4..10` returns an iterator which yields the `Name`, `Lpar`, `Rpar`, and
/// `Colon` tokens. But if the given positions don't match any token boundaries, an empty
/// iterator is returned.
pub fn kinds_within_range<T: Ranged>(&self, ranged: T) -> TokenKindIter {
Member Author commented:

So, this isn't currently being used anywhere. This would basically replace the usages of `lex_starts_at`, but it turns out all of the usages of that function are in the AST checker, where we don't have access to the token stream.

One solution here would be to store a `TokenKinds` struct which contains `Vec<(TokenKind, TextRange)>` on the `Checker`. This way, the rules which use `lex_starts_at` or `lex` can still get the tokens.

Another would be to club this change with the parser. I'm leaning more towards this.

let Ok(start_index) = self.binary_search_by_key(&ranged.start(), |result| match result {
Ok((_, range)) => range.start(),
Err(error) => error.location().start(),
}) else {
return TokenKindIter::default();
};

let Ok(end_index) = self.binary_search_by_key(&ranged.end(), |result| match result {
Ok((_, range)) => range.end(),
Err(error) => error.location().end(),
}) else {
return TokenKindIter::default();
};

TokenKindIter::new(self.get(start_index..=end_index).unwrap_or(&[]))
}

/// Consumes the [`Tokens`], returning the underlying vector of [`LexResult`].
pub fn into_inner(self) -> Vec<LexResult> {
self.0
}
}

impl Deref for Tokens {
type Target = [LexResult];

fn deref(&self) -> &Self::Target {
&self.0
}
}

/// An iterator over the [`TokenKind`] and the corresponding range.
///
/// This struct is created by the [`Tokens::kinds`] method.
#[derive(Clone, Default)]
pub struct TokenKindIter<'a> {
inner: std::iter::Flatten<std::slice::Iter<'a, LexResult>>,
}

impl<'a> TokenKindIter<'a> {
/// Create a new iterator from a slice of [`LexResult`].
pub fn new(tokens: &'a [LexResult]) -> Self {
Self {
inner: tokens.iter().flatten(),
}
}

/// Return the next value without advancing the iterator.
pub fn peek(&mut self) -> Option<(TokenKind, TextRange)> {
self.clone().next()
}
}

impl Iterator for TokenKindIter<'_> {
type Item = (TokenKind, TextRange);

fn next(&mut self) -> Option<Self::Item> {
let &(ref tok, range) = self.inner.next()?;
Some((TokenKind::from_token(tok), range))
}
}

impl FusedIterator for TokenKindIter<'_> {}

impl DoubleEndedIterator for TokenKindIter<'_> {
fn next_back(&mut self) -> Option<Self::Item> {
let &(ref tok, range) = self.inner.next_back()?;
Some((TokenKind::from_token(tok), range))
}
}

/// Collect tokens up to and including the first error.
pub fn tokenize(contents: &str, mode: Mode) -> Vec<LexResult> {
pub fn tokenize(contents: &str, mode: Mode) -> Tokens {
let mut tokens: Vec<LexResult> = allocate_tokens_vec(contents);
for tok in lexer::lex(contents, mode) {
let is_err = tok.is_err();
@@ -350,7 +458,7 @@ pub fn tokenize(contents: &str, mode: Mode) -> Vec<LexResult> {
}
}

tokens
Tokens(tokens)
}

/// Tokenizes all tokens.
@@ -380,7 +488,7 @@ fn approximate_tokens_lower_bound(contents: &str) -> usize {

/// Parse a full Python program from its tokens.
pub fn parse_program_tokens(
tokens: Vec<LexResult>,
tokens: Tokens,
source: &str,
is_jupyter_notebook: bool,
) -> anyhow::Result<Suite, ParseError> {
@@ -389,7 +497,7 @@ pub fn parse_program_tokens(
} else {
Mode::Module
};
match parse_tokens(tokens, source, mode)? {
match parse_tokens(tokens.into_inner(), source, mode)? {
Mod::Module(m) => Ok(m.body),
Mod::Expression(_) => unreachable!("Mode::Module doesn't return other variant"),
}
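
For orientation, here is a minimal sketch (assumed caller code, not part of the diff) of the new `Tokens` API end to end: `tokenize` now returns `Tokens` instead of `Vec<LexResult>`, `kinds()` yields `(TokenKind, TextRange)` pairs, and `into_inner()` hands the underlying lex results back to `parse_tokens`.

```rust
use ruff_python_parser::{parse_tokens, tokenize, Mode, ParseError, TokenKind};

fn list_defs(source: &str) -> Result<(), ParseError> {
    // `tokenize` now returns the `Tokens` newtype rather than a bare `Vec<LexResult>`.
    let tokens = tokenize(source, Mode::Module);

    // Iterate over `(TokenKind, TextRange)` pairs without consuming the owned tokens.
    for (kind, range) in tokens.kinds() {
        if kind == TokenKind::Def {
            println!("`def` keyword at {range:?}");
        }
    }

    // Recover the underlying `Vec<LexResult>` when the parser needs ownership.
    let _module = parse_tokens(tokens.into_inner(), source, Mode::Module)?;
    Ok(())
}
```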
5 changes: 5 additions & 0 deletions crates/ruff_python_parser/src/token.rs
@@ -228,6 +228,11 @@ pub enum Tok {
}

impl Tok {
#[inline]
pub fn kind(&self) -> TokenKind {
TokenKind::from_token(self)
}

pub fn start_marker(mode: Mode) -> Self {
match mode {
Mode::Module | Mode::Ipython => Tok::StartModule,
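
A small sketch (assumed usage, not part of the diff) of the new `Tok::kind` helper: comparing on `TokenKind` avoids matching every data-carrying `Tok` variant by hand.

```rust
use ruff_python_parser::{Tok, TokenKind};

// Hypothetical check: `Tok::Comment` carries the comment text, but the kind
// comparison doesn't need it.
fn is_comment(tok: &Tok) -> bool {
    tok.kind() == TokenKind::Comment
}
```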
@@ -1,5 +1,4 @@
use ruff_python_index::Indexer;
use ruff_python_parser::lexer::LexResult;
use ruff_python_parser::{tokenize, Mode};
use ruff_source_file::Locator;
use ruff_text_size::TextSize;
@@ -38,7 +37,7 @@ fn block_comments_indented_block() {
fn block_comments_single_line_is_not_a_block() {
// arrange
let source = "\n";
let tokens: Vec<LexResult> = tokenize(source, Mode::Module);
let tokens = tokenize(source, Mode::Module);
let locator = Locator::new(source);
let indexer = Indexer::from_tokens(&tokens, &locator);

3 changes: 1 addition & 2 deletions crates/ruff_server/src/lint.rs
@@ -13,7 +13,6 @@ use ruff_linter::{
use ruff_python_ast::PySourceType;
use ruff_python_codegen::Stylist;
use ruff_python_index::Indexer;
use ruff_python_parser::lexer::LexResult;
use ruff_python_parser::AsMode;
use ruff_source_file::Locator;
use ruff_text_size::Ranged;
@@ -76,7 +75,7 @@ pub(crate) fn check(
let source_kind = SourceKind::Python(contents.to_string());

// Tokenize once.
let tokens: Vec<LexResult> = ruff_python_parser::tokenize(contents, source_type.as_mode());
let tokens = ruff_python_parser::tokenize(contents, source_type.as_mode());

// Map row and column locations to byte slices (lazily).
let locator = Locator::with_index(contents, index);
3 changes: 1 addition & 2 deletions crates/ruff_wasm/src/lib.rs
@@ -17,7 +17,6 @@ use ruff_python_ast::{Mod, PySourceType};
use ruff_python_codegen::Stylist;
use ruff_python_formatter::{format_module_ast, pretty_comments, PyFormatContext, QuoteStyle};
use ruff_python_index::{CommentRangesBuilder, Indexer};
use ruff_python_parser::lexer::LexResult;
use ruff_python_parser::{parse_tokens, tokenize_all, AsMode, Mode, Program};
use ruff_python_trivia::CommentRanges;
use ruff_source_file::{Locator, SourceLocation};
@@ -162,7 +161,7 @@ impl Workspace {
let source_kind = SourceKind::Python(contents.to_string());

// Tokenize once.
let tokens: Vec<LexResult> = ruff_python_parser::tokenize(contents, source_type.as_mode());
let tokens = ruff_python_parser::tokenize(contents, source_type.as_mode());

// Map row and column locations to byte slices (lazily).
let locator = Locator::new(contents);