From b0a62f365f43cf7522714bbbb06e44c63d1ac869 Mon Sep 17 00:00:00 2001
From: Lukas Kalbertodt <lukas.kalbertodt@gmail.com>
Date: Wed, 9 Jun 2021 14:47:28 +0200
Subject: [PATCH] Add conversion of `\r\n` to `\n` in *raw* (byte) string
 literals

---
 CHANGELOG.md         |  2 +-
 src/bytestr/mod.rs   |  4 +--
 src/bytestr/tests.rs |  9 +++++++
 src/escape.rs        | 60 ++++++++++++++++++++++++++++++++++----------
 src/string/mod.rs    |  4 +--
 src/string/tests.rs  |  9 +++++++
 6 files changed, 70 insertions(+), 18 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 47aa553..f927589 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file.
 
 
 ## [Unreleased]
-### Chaned
+### Changed
 - Fixed (byte) string literal parsing by:
     - Correctly handling "string continue" sequences
     - Correctly converting `\n\r` into `\n`
diff --git a/src/bytestr/mod.rs b/src/bytestr/mod.rs
index 04047c5..6cfb61d 100644
--- a/src/bytestr/mod.rs
+++ b/src/bytestr/mod.rs
@@ -73,10 +73,10 @@ impl<B: Buffer> ByteStringLit<B> {
     /// Precondition: input has to start with either `b"` or `br`.
     pub(crate) fn parse_impl(input: B) -> Result<Self, ParseError> {
         if input.starts_with(r"br") {
-            let num_hashes = scan_raw_string::<u8>(&input, 2)?;
+            let (value, num_hashes) = scan_raw_string::<u8>(&input, 2)?;
             Ok(Self {
                 raw: input,
-                value: None,
+                value: value.map(|s| s.into_bytes()),
                 num_hashes: Some(num_hashes),
             })
         } else {
diff --git a/src/bytestr/tests.rs b/src/bytestr/tests.rs
index 3029b2c..8e3c033 100644
--- a/src/bytestr/tests.rs
+++ b/src/bytestr/tests.rs
@@ -107,6 +107,15 @@ fn crlf_newlines() {
 
     let lit = ByteStringLit::parse("b\"foo\r\n\"").expect("failed to parse");
     assert_eq!(lit.value(), b"foo\n");
+
+    let lit = ByteStringLit::parse("br\"foo\r\nbar\"").expect("failed to parse");
+    assert_eq!(lit.value(), b"foo\nbar");
+
+    let lit = ByteStringLit::parse("br#\"\r\nbar\"#").expect("failed to parse");
+    assert_eq!(lit.value(), b"\nbar");
+
+    let lit = ByteStringLit::parse("br##\"foo\r\n\"##").expect("failed to parse");
+    assert_eq!(lit.value(), b"foo\n");
 }
 
 #[test]
diff --git a/src/escape.rs b/src/escape.rs
index c406c09..dfc8420 100644
--- a/src/escape.rs
+++ b/src/escape.rs
@@ -162,7 +162,7 @@ pub(crate) fn unescape_string<E: Escapee>(
         return Err(perr(None, UnterminatedString));
     }
 
-    // `value` is only empty there was no escape in the input string
+    // `value` is only empty if there was no escape in the input string
     // (with the special case of the input being empty). This means the
     // string value basically equals the input, so we store `None`.
     let value = if value.is_empty() {
@@ -177,12 +177,13 @@ pub(crate) fn unescape_string<E: Escapee>(
     Ok(value)
 }
 
-/// Reads and checks a raw (byte) string literal. Returns the number of hashes
-/// used by the literal.
+/// Reads and checks a raw (byte) string literal, converting `\r\n` sequences to
+/// just `\n` sequences. Returns an optional new string (if the input contained
+/// any `\r\n`) and the number of hashes used by the literal.
 pub(crate) fn scan_raw_string<E: Escapee>(
     input: &str,
     offset: usize,
-) -> Result<u32, ParseError> {
+) -> Result<(Option<String>, u32), ParseError> {
     // Raw string literal
     let num_hashes = input[offset..].bytes().position(|b| b != b'#')
         .ok_or(perr(None, InvalidLiteral))?;
@@ -194,22 +195,43 @@ pub(crate) fn scan_raw_string<E: Escapee>(
     let hashes = &input[offset..num_hashes + offset];
 
     let mut closing_quote_pos = None;
-    for (i, b) in input[start_inner..].bytes().enumerate() {
-        if b == b'"' && input[start_inner + i + 1..].starts_with(hashes) {
-            closing_quote_pos = Some(i + start_inner);
+    let mut i = start_inner;
+    let mut end_last_escape = start_inner;
+    let mut value = String::new();
+    while i < input.len() {
+        let b = input.as_bytes()[i];
+        if b == b'"' && input[i + 1..].starts_with(hashes) {
+            closing_quote_pos = Some(i);
             break;
         }
 
-        if E::SUPPORTS_UNICODE {
-            if b == b'\r' && input.as_bytes().get(start_inner + i + 1) != Some(&b'\n') {
-                return Err(perr(i + start_inner, IsolatedCr));
+        if b == b'\r' {
+            // Convert `\r\n` into `\n`. This is currently not well documented
+            // in the Rust reference, but is done even for raw strings. That's
+            // because rustc simply converts all line endings when reading
+            // source files.
+            if input.as_bytes().get(i + 1) == Some(&b'\n') {
+                value.push_str(&input[end_last_escape..i]);
+                value.push('\n');
+                i += 2;
+                end_last_escape = i;
+                continue;
+            } else if E::SUPPORTS_UNICODE {
+                // A lone `\r` (not followed by `\n`) is an error in raw
+                // strings; raw byte strings accept it as a regular byte.
+                return Err(perr(i, IsolatedCr))
             }
-        } else {
+        }
+
+        if !E::SUPPORTS_UNICODE {
             if !b.is_ascii() {
-                return Err(perr(i + start_inner, NonAsciiInByteLiteral));
+                return Err(perr(i, NonAsciiInByteLiteral));
             }
         }
+
+        i += 1;
     }
+
     let closing_quote_pos = closing_quote_pos
         .ok_or(perr(None, UnterminatedRawString))?;
 
@@ -217,5 +239,17 @@ pub(crate) fn scan_raw_string<E: Escapee>(
         return Err(perr(closing_quote_pos + num_hashes + 1..input.len(), UnexpectedChar));
     }
 
-    Ok(num_hashes as u32)
+    // `value` is only empty if there was no `\r\n` in the literal, as every
+    // converted `\r\n` pushes at least a `\n`. In that case the string value
+    // equals the raw inner part of the input, so we store `None`.
+    let value = if value.is_empty() {
+        None
+    } else {
+        // There was a `\r\n` in the string, so we still need to push the
+        // remaining part after the last converted `\r\n`.
+        value.push_str(&input[end_last_escape..closing_quote_pos]);
+        Some(value)
+    };
+
+    Ok((value, num_hashes as u32))
 }
diff --git a/src/string/mod.rs b/src/string/mod.rs
index 1c28857..a21f7a7 100644
--- a/src/string/mod.rs
+++ b/src/string/mod.rs
@@ -70,10 +70,10 @@ impl<B: Buffer> StringLit<B> {
     /// Precondition: input has to start with either `"` or `r`.
     pub(crate) fn parse_impl(input: B) -> Result<Self, ParseError> {
         if input.starts_with('r') {
-            let num_hashes = scan_raw_string::<char>(&input, 1)?;
+            let (value, num_hashes) = scan_raw_string::<char>(&input, 1)?;
             Ok(Self {
                 raw: input,
-                value: None,
+                value,
                 num_hashes: Some(num_hashes),
             })
         } else {
diff --git a/src/string/tests.rs b/src/string/tests.rs
index fb16a15..8d8882e 100644
--- a/src/string/tests.rs
+++ b/src/string/tests.rs
@@ -141,6 +141,15 @@ fn crlf_newlines() {
 
     let lit = StringLit::parse("\"лиса\r\n\"").expect("failed to parse");
     assert_eq!(lit.value(), "лиса\n");
+
+    let lit = StringLit::parse("r\"foo\r\nbar\"").expect("failed to parse");
+    assert_eq!(lit.value(), "foo\nbar");
+
+    let lit = StringLit::parse("r#\"\r\nbar\"#").expect("failed to parse");
+    assert_eq!(lit.value(), "\nbar");
+
+    let lit = StringLit::parse("r##\"лиса\r\n\"##").expect("failed to parse");
+    assert_eq!(lit.value(), "лиса\n");
 }
 
 #[test]