Skip to content

Commit

Permalink
Fix parsing of byte escapes in UTF-8 strings to produce proper Unicode characters
Browse files Browse the repository at this point in the history
  • Loading branch information
juntyr committed Aug 24, 2023
1 parent 889f908 commit 7abb5ea
Show file tree
Hide file tree
Showing 6 changed files with 112 additions and 15 deletions.
2 changes: 1 addition & 1 deletion src/de/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ impl<'de, 'a> de::Deserializer<'de> for &'a mut Deserializer<'de> {
b'"' | b'r' => self.deserialize_string(visitor),
b'b' => self.deserialize_byte_buf(visitor),
b'\'' => self.deserialize_char(visitor),
other => Err(Error::UnexpectedByte(other as char)),
other => Err(Error::UnexpectedByte(other)),
}
}

Expand Down
8 changes: 6 additions & 2 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ pub enum Error {

UnclosedBlockComment,
UnderscoreAtBeginning,
UnexpectedByte(char),
UnexpectedByte(u8),

Utf8Error(Utf8Error),
TrailingCharacters,
Expand Down Expand Up @@ -174,7 +174,11 @@ impl fmt::Display for Error {
Error::UnderscoreAtBeginning => {
f.write_str("Unexpected leading underscore in a number")
}
Error::UnexpectedByte(ref byte) => write!(f, "Unexpected byte {:?}", byte),
Error::UnexpectedByte(byte) => {
let escaped_byte = std::ascii::escape_default(byte)
.map(char::from).collect::<String>();
write!(f, "Unexpected byte '{}'", escaped_byte)
},
Error::TrailingCharacters => f.write_str("Non-whitespace trailing characters"),
Error::InvalidValueForType {
ref expected,
Expand Down
60 changes: 50 additions & 10 deletions src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ impl<'a> Bytes<'a> {

if digit >= base {
let _ = bytes.advance(i);
// we know that the byte is an ASCII character here
return Err(Error::InvalidIntegerDigit {
digit: char::from(byte),
base,
Expand Down Expand Up @@ -433,8 +434,9 @@ impl<'a> Bytes<'a> {
let c = if c == b'\\' {
let _ = self.advance(1);

match self.parse_escape(EscapeEncoding::Utf8)? {
EscapeCharacter::Ascii(b) => b as char,
match self.parse_escape(EscapeEncoding::Utf8, true)? {
// we know that this byte is an ASCII character
EscapeCharacter::Ascii(b) => char::from(b),
EscapeCharacter::Utf8(c) => c,
}
} else {
Expand Down Expand Up @@ -767,6 +769,7 @@ impl<'a> Bytes<'a> {
_ => (),
}

// we know that the byte is an ASCII character here
f.push(char::from(*b));
}

Expand Down Expand Up @@ -1087,7 +1090,7 @@ impl<'a> Bytes<'a> {
loop {
let _ = self.advance(i + 1);

match self.parse_escape(encoding)? {
match self.parse_escape(encoding, false)? {
EscapeCharacter::Ascii(c) => s.push(c),
EscapeCharacter::Utf8(c) => match c.len_utf8() {
1 => s.push(c as u8),
Expand Down Expand Up @@ -1166,7 +1169,7 @@ impl<'a> Bytes<'a> {
}
}

fn parse_escape(&mut self, encoding: EscapeEncoding) -> Result<EscapeCharacter> {
fn parse_escape(&mut self, encoding: EscapeEncoding, is_char: bool) -> Result<EscapeCharacter> {
let c = match self.eat_byte()? {
b'\'' => EscapeCharacter::Ascii(b'\''),
b'"' => EscapeCharacter::Ascii(b'"'),
Expand All @@ -1176,11 +1179,46 @@ impl<'a> Bytes<'a> {
b't' => EscapeCharacter::Ascii(b'\t'),
b'0' => EscapeCharacter::Ascii(b'\0'),
b'x' => {
let b = self.decode_ascii_escape()?;
match encoding {
EscapeEncoding::Binary => EscapeCharacter::Ascii(b),
EscapeEncoding::Utf8 => EscapeCharacter::Utf8(b as char),
// Fast exit for ascii escape in byte string
let b: u8 = self.decode_ascii_escape()?;
if let EscapeEncoding::Binary = encoding {
return Ok(EscapeCharacter::Ascii(b));
}

// Fast exit for ascii character in UTF-8 string
let mut bytes = [b, 0, 0, 0];
if let Ok(Some(c)) = from_utf8(&bytes[..=0]).map(|s| s.chars().next()) {
return Ok(EscapeCharacter::Utf8(c));
}

if is_char {
// Character literals are not allowed to use multiple byte
// escapes to build a unicode character
return Err(Error::InvalidEscape(
"Not a valid byte-escaped Unicode character",
));
}

// UTF-8 character needs up to four bytes and we have already
// consumed one, so at most three to go
for i in 1..4 {
if !self.consume(r"\x") {
return Err(Error::InvalidEscape(
"Not a valid byte-escaped Unicode character",
));
}

bytes[i] = self.decode_ascii_escape()?;

// Check if we now have a valid UTF-8 character
if let Ok(Some(c)) = from_utf8(&bytes[..=i]).map(|s| s.chars().next()) {
return Ok(EscapeCharacter::Utf8(c));
}
}

return Err(Error::InvalidEscape(
"Not a valid byte-escaped Unicode character",
));
}
b'u' => {
self.expect_byte(b'{', Error::InvalidEscape("Missing { in Unicode escape"))?;
Expand Down Expand Up @@ -1214,7 +1252,9 @@ impl<'a> Bytes<'a> {
b'}',
Error::InvalidEscape("No } at the end of Unicode escape"),
)?;
let c = char_from_u32(bytes).ok_or(Error::InvalidEscape("Not a valid char"))?;
let c = char_from_u32(bytes).ok_or(Error::InvalidEscape(
"Not a valid Unicode-escaped character",
))?;

EscapeCharacter::Utf8(c)
}
Expand Down Expand Up @@ -1260,7 +1300,7 @@ impl<'a> Bytes<'a> {
}
}
}
b => return Err(Error::UnexpectedByte(b as char)),
b => return Err(Error::UnexpectedByte(b)),
}

Ok(true)
Expand Down
2 changes: 1 addition & 1 deletion src/value/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ mod tests {
fn string() {
assert_same::<String>(r#""hello world""#);
assert_same::<String>(r#""this is a Rusty 🦀 string""#);
assert_same::<String>(r#""this is now valid UTF-8 \xf8\xa1\xa1\xa1\xa1""#);
assert_same::<String>(r#""this is now valid UTF-8 \xf0\x9f\xa6\x80""#);
}

#[test]
Expand Down
2 changes: 1 addition & 1 deletion tests/407_raw_value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ fn test_raw_value_invalid() {
assert_eq!(
err,
SpannedError {
code: Error::UnexpectedByte('\0'),
code: Error::UnexpectedByte(b'\0'),
position: Position { line: 1, col: 1 }
}
)
Expand Down
53 changes: 53 additions & 0 deletions tests/438_rusty_byte_strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -258,3 +258,56 @@ fn check_roundtrip<
let de_raw = ron::from_str::<T>(&ron_raw).unwrap();
assert_eq!(de_raw, val);
}

/// Checks how `\u{...}` and multi-byte `\x..` escapes are decoded in strings,
/// byte strings, and character literals.
#[test]
fn test_weird_escapes() {
    // All rejected inputs below report the same error code on line 1; only
    // the column differs from case to case.
    let byte_escape_error = |col| SpannedError {
        code: Error::InvalidEscape("Not a valid byte-escaped Unicode character"),
        position: Position { line: 1, col },
    };

    let expected = String::from("\u{1F980}");

    // A Unicode escape and the equivalent four byte escapes decode to the
    // same string; in a byte string the byte escapes are kept as raw bytes.
    assert_eq!(
        ron::from_str::<String>(r#""\u{1F980}""#),
        Ok(expected.clone())
    );
    assert_eq!(
        ron::from_str::<bytes::Bytes>(r#"b"\xf0\x9f\xa6\x80""#),
        Ok(bytes::Bytes::copy_from_slice(expected.as_bytes()))
    );
    assert_eq!(
        ron::from_str::<String>(r#""\xf0\x9f\xa6\x80""#),
        Ok(expected.clone())
    );

    // Byte-escape sequences that never form a complete UTF-8 character are
    // rejected, paired here with the column at which the error is reported.
    let rejected_strings = [
        (r#""\xf0""#, 6),
        (r#""\xf0\x9f""#, 10),
        (r#""\xf0\x9f\x40""#, 14),
        (r#""\xf0\x9f\xa6""#, 14),
    ];
    for &(input, col) in &rejected_strings {
        assert_eq!(
            ron::from_str::<String>(input),
            Err(byte_escape_error(col)),
            "input: {}",
            input
        );
    }

    // Character literals accept the Unicode escape, but not a character
    // assembled from several byte escapes.
    assert_eq!(ron::from_str::<char>(r"'\u{1F980}'"), Ok('\u{1F980}'));
    assert_eq!(
        ron::from_str::<char>(r"'\xf0\x9f\xa6\x80'"),
        Err(byte_escape_error(6))
    );
}

0 comments on commit 7abb5ea

Please sign in to comment.