Skip to content

Commit

Permalink
Lexer should consider BOM for the start offset
Browse files — browse the repository at this point in the history
  • Loading branch information
dhruvmanila committed Jun 4, 2024
1 parent 6ffb961 commit 3f5207c
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 11 deletions.
55 changes: 44 additions & 11 deletions crates/ruff_python_parser/src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ mod cursor;
mod fstring;
mod indentation;

/// The Unicode byte order mark (U+FEFF). If present, it appears as the very
/// first character of the source; the lexer must skip it, and offsets handed
/// to the lexer must account for its 3-byte UTF-8 encoding.
const BOM: char = '\u{feff}';

/// A lexer for Python source code.
#[derive(Debug)]
pub struct Lexer<'src> {
Expand Down Expand Up @@ -102,7 +104,14 @@ impl<'src> Lexer<'src> {

// TODO: Handle possible mismatch between BOM and explicit encoding declaration.
// spell-checker:ignore feff
lexer.cursor.eat_char('\u{feff}');
let start_offset = if lexer.cursor.eat_char(BOM) {
// The start offset needs to consider the BOM character if it's present.
start_offset
.checked_sub(BOM.text_len())
.unwrap_or(start_offset)
} else {
start_offset
};

if start_offset > TextSize::new(0) {
lexer.cursor.skip_bytes(start_offset.to_usize());
Expand Down Expand Up @@ -1918,8 +1927,8 @@ mod tests {
}
}

fn lex(source: &str, mode: Mode) -> LexerOutput {
let mut lexer = Lexer::new(source, mode, TextSize::default());
fn lex(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
let mut lexer = Lexer::new(source, mode, start_offset);
let mut tokens = Vec::new();
loop {
let kind = lexer.next_token();
Expand All @@ -1939,8 +1948,8 @@ mod tests {
}
}

fn lex_valid(source: &str, mode: Mode) -> LexerOutput {
let output = lex(source, mode);
fn lex_valid(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
let output = lex(source, mode, start_offset);

if !output.errors.is_empty() {
let mut message = "Unexpected lexical errors for a valid source:\n".to_string();
Expand All @@ -1955,7 +1964,7 @@ mod tests {
}

fn lex_invalid(source: &str, mode: Mode) -> LexerOutput {
let output = lex(source, mode);
let output = lex(source, mode, TextSize::default());

assert!(
!output.errors.is_empty(),
Expand All @@ -1966,11 +1975,35 @@ mod tests {
}

fn lex_source(source: &str) -> LexerOutput {
lex_valid(source, Mode::Module)
lex_valid(source, Mode::Module, TextSize::default())
}

/// Lexes `source` in [`Mode::Module`] starting from `start_offset`,
/// panicking if any lexical errors are produced.
fn lex_source_with_offset(source: &str, start_offset: TextSize) -> LexerOutput {
    lex_valid(source, Mode::Module, start_offset)
}

fn lex_jupyter_source(source: &str) -> LexerOutput {
lex_valid(source, Mode::Ipython)
lex_valid(source, Mode::Ipython, TextSize::default())
}

#[test]
fn bom() {
    // A leading BOM must be skipped: per the snapshot, the first token (`x`)
    // starts at offset 3, i.e. just past the 3-byte UTF-8 encoding of U+FEFF.
    let source = "\u{feff}x = 1";
    assert_snapshot!(lex_source(source));
}

#[test]
fn bom_with_offset() {
    // Lexing starts at a non-zero offset (7, the `y` token) in a source that
    // begins with a BOM; the BOM before the start offset must still be
    // accounted for so the emitted token ranges match the snapshot.
    let source = "\u{feff}x + y + z";
    assert_snapshot!(lex_source_with_offset(source, TextSize::new(7)));
}

#[test]
fn bom_with_offset_edge() {
    // BOM offsets the first token by 3, so make sure that lexing from offset 11 (variable z)
    // doesn't panic. Refer https://github.com/astral-sh/ruff/issues/11731
    // NOTE(review): offset 11 is the last token's start; this exercises the
    // BOM/offset adjustment near the end of the source without going past it.
    let source = "\u{feff}x + y + z";
    assert_snapshot!(lex_source_with_offset(source, TextSize::new(11)));
}

fn ipython_escape_command_line_continuation_eol(eol: &str) -> LexerOutput {
Expand Down Expand Up @@ -2114,7 +2147,7 @@ foo = ,func
def f(arg=%timeit a = b):
pass"
.trim();
let output = lex(source, Mode::Ipython);
let output = lex(source, Mode::Ipython, TextSize::default());
assert!(output.errors.is_empty());
assert_no_ipython_escape_command(&output.tokens);
}
Expand Down Expand Up @@ -2347,7 +2380,7 @@ if first:
}

fn get_tokens_only(source: &str) -> Vec<TokenKind> {
let output = lex(source, Mode::Module);
let output = lex(source, Mode::Module, TextSize::default());
assert!(output.errors.is_empty());
output.tokens.into_iter().map(|token| token.kind).collect()
}
Expand Down Expand Up @@ -2589,7 +2622,7 @@ f"{(lambda x:{x})}"
}

fn lex_fstring_error(source: &str) -> FStringErrorType {
let output = lex(source, Mode::Module);
let output = lex(source, Mode::Module, TextSize::default());
match output
.errors
.into_iter()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
source: crates/ruff_python_parser/src/lexer.rs
expression: lex_source(source)
---
## Tokens
```
[
(
Name(
"x",
),
3..4,
),
(
Equal,
5..6,
),
(
Int(
1,
),
7..8,
),
(
Newline,
8..8,
),
]
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
source: crates/ruff_python_parser/src/lexer.rs
expression: "lex_source_with_offset(source, TextSize::new(7))"
---
## Tokens
```
[
(
Name(
"y",
),
7..8,
),
(
Plus,
9..10,
),
(
Name(
"z",
),
11..12,
),
(
Newline,
12..12,
),
]
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
---
source: crates/ruff_python_parser/src/lexer.rs
expression: "lex_source_with_offset(source, TextSize::new(11))"
---
## Tokens
```
[
(
Name(
"z",
),
11..12,
),
(
Newline,
12..12,
),
]
```

0 comments on commit 3f5207c

Please sign in to comment.