refactor(parser): Update lexer methods to handle only ASCII identifiers

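Adjusted the `consume_ident_sequence` method in the biome_css_parser lexer so that it returns an additional boolean indicating whether the identifier consists solely of ASCII characters. The buffer is now filled with lowercased ASCII bytes, and the keyword lookup returns `IDENT` right away when a non-ASCII character was encountered. This change improves the handling of non-ASCII identifiers in CSS and provides a more accurate classification of identifiers.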
biomejs · Jan 15, 2024 · aca8cb4 · aca8cb4
1 parent 80bb3d7
commit aca8cb4
Show file tree

Hide file tree

Showing 2 changed files with 48 additions and 13 deletions.
diff --git a/crates/biome_css_parser/src/lexer/mod.rs b/crates/biome_css_parser/src/lexer/mod.rs
@@ -818,9 +818,13 @@ impl<'src> CssLexer<'src> {
         // Note to keep the buffer large enough to fit every possible keyword that
         // the lexer can return
         let mut buf = [0u8; 22];
-        let count = self.consume_ident_sequence(&mut buf);
+        let (count, only_ascii_used) = self.consume_ident_sequence(&mut buf);
 
-        match buf[..count].to_ascii_lowercase().as_slice() {
+        if !only_ascii_used {
+            return IDENT;
+        }
+
+        match &buf[..count] {
             b"media" => MEDIA_KW,
             b"keyframes" => KEYFRAMES_KW,
             b"-webkit-keyframes" => KEYFRAMES_KW,
@@ -1008,30 +1012,61 @@ impl<'src> CssLexer<'src> {
         }
     }
 
-    /// Consume a ident sequence.
-    fn consume_ident_sequence(&mut self, buf: &mut [u8]) -> usize {
+    /// Consumes a sequence of identifier characters from a byte stream, appending
+    /// them to the provided buffer in lowercase ASCII form.
+    ///
+    /// This function consumes identifier code points from the stream until a
+    /// non-identifier character is encountered or no more input is available. ASCII
+    /// characters are appended to the buffer in lowercase; appending stops (while
+    /// consumption continues) once the buffer is full or a non-ASCII character is seen.
+    ///
+    /// # Arguments
+    ///
+    /// * `buf` - A mutable reference to a byte array where the identifier characters
+    ///           will be appended. This buffer should be pre-allocated and have enough
+    ///           space to hold the expected identifier.
+    ///
+    /// # Returns
+    ///
+    /// A tuple containing:
+    ///
+    /// * The number of bytes appended to the buffer (`usize`).
+    /// * A boolean indicating whether only ASCII characters were used (`true` if so).
+    ///
+    /// # Panics
+    ///
+    /// In builds with debug assertions enabled, this function panics if the current
+    /// position is not a valid start of an identifier, as checked by `self.is_ident_start()`.
+    fn consume_ident_sequence(&mut self, buf: &mut [u8]) -> (usize, bool) {
         debug_assert!(self.is_ident_start());
 
         let mut idx = 0;
         let mut is_first = true;
+        let mut only_ascii_used = true;
         // Repeatedly consume the next input code point from the stream.
         while let Some(current) = self.current_byte() {
             if let Some(part) = self.consume_ident_part(current, is_first) {
                 is_first = false;
 
-                // In this context, "+ 4" represents a safety measure for the buffer size.
-                // It ensures that there are at least 4 slots available in the buffer.
-                // This is necessary because the maximum UTF-8 sequence length is 4 bytes.
-                if let Some(buf) = buf.get_mut(idx..idx + 4) {
-                    let res = part.encode_utf8(buf);
-                    idx += res.len();
+                if only_ascii_used && !part.is_ascii() {
+                    only_ascii_used = false;
+                }
+
+                if only_ascii_used {
+                    // Ensure that there is space in the buffer.
+                    // Since we're only dealing with ASCII, we need at most 1 byte.
+                    if let Some(buf) = buf.get_mut(idx..idx + 1) {
+                        // Convert the ASCII character to lowercase.
+                        buf[0] = part.to_ascii_lowercase() as u8;
+                        idx += 1;
+                    }
                 }
             } else {
                 break;
             }
         }
 
-        idx
+        (idx, only_ascii_used)
     }
 
     /// Tries to consume a character that forms part of a CSS identifier.

diff --git a/crates/biome_js_parser/src/lexer/mod.rs b/crates/biome_js_parser/src/lexer/mod.rs
@@ -637,7 +637,7 @@ impl<'src> JsLexer<'src> {
             // but it may cause a panic for other crates which just consume the diagnostics
             let invalid = self.current_char_unchecked();
             let err = ParseDiagnostic::new(  "expected hex digits for a unicode code point escape, but encountered an invalid character",
-                self.position..self.position + invalid.len_utf8() );
+                                             self.position..self.position + invalid.len_utf8() );
             self.diagnostics.push(err);
             self.position -= 1;
             return Err(());
@@ -1902,7 +1902,7 @@ impl<'src> JsLexer<'src> {
                                 self.resolve_identifier(chr)
                             } else {
                                 let err = ParseDiagnostic::new(  "unexpected unicode escape",
-                                    start..self.position).with_hint("this escape is unexpected, as it does not designate the start of an identifier");
+                                                                 start..self.position).with_hint("this escape is unexpected, as it does not designate the start of an identifier");
                                 self.diagnostics.push(err);
                                 self.next_byte();
                                 JsSyntaxKind::ERROR_TOKEN