Skip to content

Commit

Permalink
Lexer should consider BOM for the start offset
Browse files — browse the repository at this point in the history
  • Loading branch information
dhruvmanila committed Jun 4, 2024
1 parent 6ffb961 commit 3f5207c
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 11 deletions.
55 changes: 44 additions & 11 deletions crates/ruff_python_parser/src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ mod cursor;
mod fstring;
mod indentation;

/// The Unicode byte order mark (U+FEFF). If present, it appears as the very
/// first character of the source; the lexer must skip it, and offsets handed
/// to the lexer must account for its 3-byte UTF-8 encoding.
const BOM: char = '\u{feff}';

/// A lexer for Python source code.
#[derive(Debug)]
pub struct Lexer<'src> {
Expand Down Expand Up @@ -102,7 +104,14 @@ impl<'src> Lexer<'src> {

// TODO: Handle possible mismatch between BOM and explicit encoding declaration.
// spell-checker:ignore feff
lexer.cursor.eat_char('\u{feff}');
let start_offset = if lexer.cursor.eat_char(BOM) {
// The start offset needs to consider the BOM character if it's present.
start_offset
.checked_sub(BOM.text_len())
.unwrap_or(start_offset)
} else {
start_offset
};

if start_offset > TextSize::new(0) {
lexer.cursor.skip_bytes(start_offset.to_usize());
Expand Down Expand Up @@ -1918,8 +1927,8 @@ mod tests {
}
}

fn lex(source: &str, mode: Mode) -> LexerOutput {
let mut lexer = Lexer::new(source, mode, TextSize::default());
fn lex(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
let mut lexer = Lexer::new(source, mode, start_offset);
let mut tokens = Vec::new();
loop {
let kind = lexer.next_token();
Expand All @@ -1939,8 +1948,8 @@ mod tests {
}
}

fn lex_valid(source: &str, mode: Mode) -> LexerOutput {
let output = lex(source, mode);
fn lex_valid(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
let output = lex(source, mode, start_offset);

if !output.errors.is_empty() {
let mut message = "Unexpected lexical errors for a valid source:\n".to_string();
Expand All @@ -1955,7 +1964,7 @@ mod tests {
}

fn lex_invalid(source: &str, mode: Mode) -> LexerOutput {
let output = lex(source, mode);
let output = lex(source, mode, TextSize::default());

assert!(
!output.errors.is_empty(),
Expand All @@ -1966,11 +1975,35 @@ mod tests {
}

fn lex_source(source: &str) -> LexerOutput {
lex_valid(source, Mode::Module)
lex_valid(source, Mode::Module, TextSize::default())
}

/// Lexes `source` in [`Mode::Module`] starting from `start_offset`,
/// panicking if any lexical errors are produced.
fn lex_source_with_offset(source: &str, start_offset: TextSize) -> LexerOutput {
    lex_valid(source, Mode::Module, start_offset)
}

fn lex_jupyter_source(source: &str) -> LexerOutput {
lex_valid(source, Mode::Ipython)
lex_valid(source, Mode::Ipython, TextSize::default())
}

#[test]
fn bom() {
    // A leading BOM must be skipped: per the snapshot, the first token (`x`)
    // starts at offset 3, i.e. just past the 3-byte UTF-8 encoding of U+FEFF.
    let source = "\u{feff}x = 1";
    assert_snapshot!(lex_source(source));
}

#[test]
fn bom_with_offset() {
    // Lexing starts at a non-zero offset (7, the `y` token) in a source that
    // begins with a BOM; the BOM before the start offset must still be
    // accounted for so the emitted token ranges match the snapshot.
    let source = "\u{feff}x + y + z";
    assert_snapshot!(lex_source_with_offset(source, TextSize::new(7)));
}

#[test]
fn bom_with_offset_edge() {
    // BOM offsets the first token by 3, so make sure that lexing from offset 11 (variable z)
    // doesn't panic. Refer https://github.com/astral-sh/ruff/issues/11731
    // NOTE(review): offset 11 is the last token's start; this exercises the
    // BOM/offset adjustment near the end of the source without going past it.
    let source = "\u{feff}x + y + z";
    assert_snapshot!(lex_source_with_offset(source, TextSize::new(11)));
}

fn ipython_escape_command_line_continuation_eol(eol: &str) -> LexerOutput {
Expand Down Expand Up @@ -2114,7 +2147,7 @@ foo = ,func
def f(arg=%timeit a = b):
pass"
.trim();
let output = lex(source, Mode::Ipython);
let output = lex(source, Mode::Ipython, TextSize::default());
assert!(output.errors.is_empty());
assert_no_ipython_escape_command(&output.tokens);
}
Expand Down Expand Up @@ -2347,7 +2380,7 @@ if first:
}

fn get_tokens_only(source: &str) -> Vec<TokenKind> {
let output = lex(source, Mode::Module);
let output = lex(source, Mode::Module, TextSize::default());
assert!(output.errors.is_empty());
output.tokens.into_iter().map(|token| token.kind).collect()
}
Expand Down Expand Up @@ -2589,7 +2622,7 @@ f"{(lambda x:{x})}"
}

fn lex_fstring_error(source: &str) -> FStringErrorType {
let output = lex(source, Mode::Module);
let output = lex(source, Mode::Module, TextSize::default());
match output
.errors
.into_iter()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
source: crates/ruff_python_parser/src/lexer.rs
expression: lex_source(source)
---
## Tokens
```
[
(
Name(
"x",
),
3..4,
),
(
Equal,
5..6,
),
(
Int(
1,
),
7..8,
),
(
Newline,
8..8,
),
]
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
source: crates/ruff_python_parser/src/lexer.rs
expression: "lex_source_with_offset(source, TextSize::new(7))"
---
## Tokens
```
[
(
Name(
"y",
),
7..8,
),
(
Plus,
9..10,
),
(
Name(
"z",
),
11..12,
),
(
Newline,
12..12,
),
]
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
---
source: crates/ruff_python_parser/src/lexer.rs
expression: "lex_source_with_offset(source, TextSize::new(11))"
---
## Tokens
```
[
(
Name(
"z",
),
11..12,
),
(
Newline,
12..12,
),
]
```

0 comments on commit 3f5207c

Please sign in to comment.