From b0a62f365f43cf7522714bbbb06e44c63d1ac869 Mon Sep 17 00:00:00 2001 From: Lukas Kalbertodt Date: Wed, 9 Jun 2021 14:47:28 +0200 Subject: [PATCH] Add conversion of `\r\n` to `\n` in *raw* (byte) string literals --- CHANGELOG.md | 2 +- src/bytestr/mod.rs | 4 +-- src/bytestr/tests.rs | 9 +++++++ src/escape.rs | 60 ++++++++++++++++++++++++++++++++++---------- src/string/mod.rs | 4 +-- src/string/tests.rs | 9 +++++++ 6 files changed, 70 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 47aa553..f927589 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file. ## [Unreleased] -### Chaned +### Changed - Fixed (byte) string literal parsing by: - Correctly handling "string continue" sequences - Correctly converting `\n\r` into `\n` diff --git a/src/bytestr/mod.rs b/src/bytestr/mod.rs index 04047c5..6cfb61d 100644 --- a/src/bytestr/mod.rs +++ b/src/bytestr/mod.rs @@ -73,10 +73,10 @@ impl ByteStringLit { /// Precondition: input has to start with either `b"` or `br`. 
pub(crate) fn parse_impl(input: B) -> Result { if input.starts_with(r"br") { - let num_hashes = scan_raw_string::(&input, 2)?; + let (value, num_hashes) = scan_raw_string::(&input, 2)?; Ok(Self { raw: input, - value: None, + value: value.map(|s| s.into_bytes()), num_hashes: Some(num_hashes), }) } else { diff --git a/src/bytestr/tests.rs b/src/bytestr/tests.rs index 3029b2c..8e3c033 100644 --- a/src/bytestr/tests.rs +++ b/src/bytestr/tests.rs @@ -107,6 +107,15 @@ fn crlf_newlines() { let lit = ByteStringLit::parse("b\"foo\r\n\"").expect("failed to parse"); assert_eq!(lit.value(), b"foo\n"); + + let lit = ByteStringLit::parse("br\"foo\r\nbar\"").expect("failed to parse"); + assert_eq!(lit.value(), b"foo\nbar"); + + let lit = ByteStringLit::parse("br#\"\r\nbar\"#").expect("failed to parse"); + assert_eq!(lit.value(), b"\nbar"); + + let lit = ByteStringLit::parse("br##\"foo\r\n\"##").expect("failed to parse"); + assert_eq!(lit.value(), b"foo\n"); } #[test] diff --git a/src/escape.rs b/src/escape.rs index c406c09..dfc8420 100644 --- a/src/escape.rs +++ b/src/escape.rs @@ -162,7 +162,7 @@ pub(crate) fn unescape_string( return Err(perr(None, UnterminatedString)); } - // `value` is only empty there was no escape in the input string + // `value` is only empty if there was no escape in the input string // (with the special case of the input being empty). This means the // string value basically equals the input, so we store `None`. let value = if value.is_empty() { @@ -177,12 +177,13 @@ pub(crate) fn unescape_string( Ok(value) } -/// Reads and checks a raw (byte) string literal. Returns the number of hashes -/// used by the literal. +/// Reads and checks a raw (byte) string literal, converting `\r\n` sequences to +/// just `\n` sequences. Returns an optional new string (if the input contained +/// any `\r\n`) and the number of hashes used by the literal. 
pub(crate) fn scan_raw_string( input: &str, offset: usize, -) -> Result { +) -> Result<(Option, u32), ParseError> { // Raw string literal let num_hashes = input[offset..].bytes().position(|b| b != b'#') .ok_or(perr(None, InvalidLiteral))?; @@ -194,22 +195,43 @@ pub(crate) fn scan_raw_string( let hashes = &input[offset..num_hashes + offset]; let mut closing_quote_pos = None; - for (i, b) in input[start_inner..].bytes().enumerate() { - if b == b'"' && input[start_inner + i + 1..].starts_with(hashes) { - closing_quote_pos = Some(i + start_inner); + let mut i = start_inner; + let mut end_last_escape = start_inner; + let mut value = String::new(); + while i < input.len() { + let b = input.as_bytes()[i]; + if b == b'"' && input[i + 1..].starts_with(hashes) { + closing_quote_pos = Some(i); break; } - if E::SUPPORTS_UNICODE { - if b == b'\r' && input.as_bytes().get(start_inner + i + 1) != Some(&b'\n') { - return Err(perr(i + start_inner, IsolatedCr)); + if b == b'\r' { + // Convert `\r\n` into `\n`. This is currently not well documented + // in the Rust reference, but is done even for raw strings. That's + // because rustc simply converts all line endings when reading + // source files. + if input.as_bytes().get(i + 1) == Some(&b'\n') { + value.push_str(&input[end_last_escape..i]); + value.push('\n'); + i += 2; + end_last_escape = i; + continue; + } else if E::SUPPORTS_UNICODE { + // If no \n follows the \r and we are scanning a raw string + // (not raw byte string), we error. 
+ return Err(perr(i, IsolatedCr)) } - } else { + } + + if !E::SUPPORTS_UNICODE { if !b.is_ascii() { - return Err(perr(i + start_inner, NonAsciiInByteLiteral)); + return Err(perr(i, NonAsciiInByteLiteral)); } } + + i += 1; } + let closing_quote_pos = closing_quote_pos .ok_or(perr(None, UnterminatedRawString))?; @@ -217,5 +239,17 @@ pub(crate) fn scan_raw_string( return Err(perr(closing_quote_pos + num_hashes + 1..input.len(), UnexpectedChar)); } - Ok(num_hashes as u32) + // `value` is only empty if there was no \r\n in the input string (with the + // special case of the input being empty). This means the string value + // equals the input, so we store `None`. + let value = if value.is_empty() { + None + } else { + // There was a \r\n in the string, so we still need to push the part of + // the string that follows the last converted \r\n. + value.push_str(&input[end_last_escape..closing_quote_pos]); + Some(value) + }; + + Ok((value, num_hashes as u32)) } diff --git a/src/string/mod.rs b/src/string/mod.rs index 1c28857..a21f7a7 100644 --- a/src/string/mod.rs +++ b/src/string/mod.rs @@ -70,10 +70,10 @@ impl StringLit { /// Precondition: input has to start with either `"` or `r`. 
pub(crate) fn parse_impl(input: B) -> Result { if input.starts_with('r') { - let num_hashes = scan_raw_string::(&input, 1)?; + let (value, num_hashes) = scan_raw_string::(&input, 1)?; Ok(Self { raw: input, - value: None, + value, num_hashes: Some(num_hashes), }) } else { diff --git a/src/string/tests.rs b/src/string/tests.rs index fb16a15..8d8882e 100644 --- a/src/string/tests.rs +++ b/src/string/tests.rs @@ -141,6 +141,15 @@ fn crlf_newlines() { let lit = StringLit::parse("\"лиса\r\n\"").expect("failed to parse"); assert_eq!(lit.value(), "лиса\n"); + + let lit = StringLit::parse("r\"foo\r\nbar\"").expect("failed to parse"); + assert_eq!(lit.value(), "foo\nbar"); + + let lit = StringLit::parse("r#\"\r\nbar\"#").expect("failed to parse"); + assert_eq!(lit.value(), "\nbar"); + + let lit = StringLit::parse("r##\"лиса\r\n\"##").expect("failed to parse"); + assert_eq!(lit.value(), "лиса\n"); } #[test]