diff --git a/src/de/mod.rs b/src/de/mod.rs index 569c2bb7..d6460644 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -323,7 +323,7 @@ impl<'de, 'a> de::Deserializer<'de> for &'a mut Deserializer<'de> { b'"' | b'r' => self.deserialize_string(visitor), b'b' => self.deserialize_byte_buf(visitor), b'\'' => self.deserialize_char(visitor), - other => Err(Error::UnexpectedByte(other as char)), + other => Err(Error::UnexpectedByte(other)), } } diff --git a/src/error.rs b/src/error.rs index 2196df97..cfd4b5fb 100644 --- a/src/error.rs +++ b/src/error.rs @@ -66,7 +66,7 @@ pub enum Error { UnclosedBlockComment, UnderscoreAtBeginning, - UnexpectedByte(char), + UnexpectedByte(u8), Utf8Error(Utf8Error), TrailingCharacters, @@ -174,7 +174,11 @@ impl fmt::Display for Error { Error::UnderscoreAtBeginning => { f.write_str("Unexpected leading underscore in a number") } - Error::UnexpectedByte(ref byte) => write!(f, "Unexpected byte {:?}", byte), + Error::UnexpectedByte(byte) => { + let escaped_byte = std::ascii::escape_default(byte) + .map(char::from).collect::(); + write!(f, "Unexpected byte '{}'", escaped_byte) + }, Error::TrailingCharacters => f.write_str("Non-whitespace trailing characters"), Error::InvalidValueForType { ref expected, diff --git a/src/parse.rs b/src/parse.rs index e146ceab..c5125a9b 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -210,6 +210,7 @@ impl<'a> Bytes<'a> { if digit >= base { let _ = bytes.advance(i); + // we know that the byte is an ASCII character here return Err(Error::InvalidIntegerDigit { digit: char::from(byte), base, @@ -433,8 +434,9 @@ impl<'a> Bytes<'a> { let c = if c == b'\\' { let _ = self.advance(1); - match self.parse_escape(EscapeEncoding::Utf8)? { - EscapeCharacter::Ascii(b) => b as char, + match self.parse_escape(EscapeEncoding::Utf8, true)? { + // we know that this byte is an ASCII character + EscapeCharacter::Ascii(b) => char::from(b), EscapeCharacter::Utf8(c) => c, } } else { @@ -767,6 +769,7 @@ impl<'a> Bytes<'a> { _ => (), } + // we know that the byte is an ASCII character here f.push(char::from(*b)); } @@ -1087,7 +1090,7 @@ impl<'a> Bytes<'a> { loop { let _ = self.advance(i + 1); - match self.parse_escape(encoding)? { + match self.parse_escape(encoding, false)? { EscapeCharacter::Ascii(c) => s.push(c), EscapeCharacter::Utf8(c) => match c.len_utf8() { 1 => s.push(c as u8), @@ -1166,7 +1169,7 @@ impl<'a> Bytes<'a> { } } - fn parse_escape(&mut self, encoding: EscapeEncoding) -> Result { + fn parse_escape(&mut self, encoding: EscapeEncoding, is_char: bool) -> Result { let c = match self.eat_byte()? { b'\'' => EscapeCharacter::Ascii(b'\''), b'"' => EscapeCharacter::Ascii(b'"'), @@ -1176,11 +1179,46 @@ impl<'a> Bytes<'a> { b't' => EscapeCharacter::Ascii(b'\t'), b'0' => EscapeCharacter::Ascii(b'\0'), b'x' => { - let b = self.decode_ascii_escape()?; - match encoding { - EscapeEncoding::Binary => EscapeCharacter::Ascii(b), - EscapeEncoding::Utf8 => EscapeCharacter::Utf8(b as char), + // Fast exit for ascii escape in byte string + let b: u8 = self.decode_ascii_escape()?; + if let EscapeEncoding::Binary = encoding { + return Ok(EscapeCharacter::Ascii(b)); } + + // Fast exit for ascii character in UTF-8 string + let mut bytes = [b, 0, 0, 0]; + if let Ok(Some(c)) = from_utf8(&bytes[..=0]).map(|s| s.chars().next()) { + return Ok(EscapeCharacter::Utf8(c)); + } + + if is_char { + // Character literals are not allowed to use multiple byte + // escapes to build a unicode character + return Err(Error::InvalidEscape( + "Not a valid byte-escaped Unicode character", + )); + } + + // UTF-8 character needs up to four bytes and we have already + // consumed one, so at most three to go + for i in 1..4 { + if !self.consume(r"\x") { + return Err(Error::InvalidEscape( + "Not a valid byte-escaped Unicode character", + )); + } + + bytes[i] = self.decode_ascii_escape()?; + + // Check if we now have a valid UTF-8 character + if let Ok(Some(c)) = from_utf8(&bytes[..=i]).map(|s| s.chars().next()) { + return Ok(EscapeCharacter::Utf8(c)); + } + } + + return Err(Error::InvalidEscape( + "Not a valid byte-escaped Unicode character", + )); } b'u' => { self.expect_byte(b'{', Error::InvalidEscape("Missing { in Unicode escape"))?; @@ -1214,7 +1252,9 @@ impl<'a> Bytes<'a> { b'}', Error::InvalidEscape("No } at the end of Unicode escape"), )?; - let c = char_from_u32(bytes).ok_or(Error::InvalidEscape("Not a valid char"))?; + let c = char_from_u32(bytes).ok_or(Error::InvalidEscape( + "Not a valid Unicode-escaped character", + ))?; EscapeCharacter::Utf8(c) } @@ -1260,7 +1300,7 @@ impl<'a> Bytes<'a> { } } } - b => return Err(Error::UnexpectedByte(b as char)), + b => return Err(Error::UnexpectedByte(b)), } Ok(true) diff --git a/src/value/mod.rs b/src/value/mod.rs index 69eb84e5..85188c40 100644 --- a/src/value/mod.rs +++ b/src/value/mod.rs @@ -229,7 +229,7 @@ mod tests { fn string() { assert_same::(r#""hello world""#); assert_same::(r#""this is a Rusty 🦀 string""#); - assert_same::(r#""this is now valid UTF-8 \xf8\xa1\xa1\xa1\xa1""#); + assert_same::(r#""this is now valid UTF-8 \xf0\x9f\xa6\x80""#); } #[test] diff --git a/tests/407_raw_value.rs b/tests/407_raw_value.rs index 32eafad0..2bf782d3 100644 --- a/tests/407_raw_value.rs +++ b/tests/407_raw_value.rs @@ -55,7 +55,7 @@ fn test_raw_value_invalid() { assert_eq!( err, SpannedError { - code: Error::UnexpectedByte('\0'), + code: Error::UnexpectedByte(b'\0'), position: Position { line: 1, col: 1 } } ) diff --git a/tests/438_rusty_byte_strings.rs b/tests/438_rusty_byte_strings.rs index 58a273ff..0b52363a 100644 --- a/tests/438_rusty_byte_strings.rs +++ b/tests/438_rusty_byte_strings.rs @@ -258,3 +258,56 @@ fn check_roundtrip< let de_raw = ron::from_str::(&ron_raw).unwrap(); assert_eq!(de_raw, val); } + +#[test] +fn test_weird_escapes() { + assert_eq!( + ron::from_str::(r#""\u{1F980}""#), + Ok(String::from("\u{1F980}")) + ); + assert_eq!( + ron::from_str::(r#"b"\xf0\x9f\xa6\x80""#), + Ok(bytes::Bytes::copy_from_slice("\u{1F980}".as_bytes())) + ); + assert_eq!( + ron::from_str::(r#""\xf0\x9f\xa6\x80""#), + Ok(String::from("\u{1F980}")) + ); + assert_eq!( + ron::from_str::(r#""\xf0""#), + Err(SpannedError { + code: Error::InvalidEscape("Not a valid byte-escaped Unicode character"), + position: Position { line: 1, col: 6 } + }) + ); + assert_eq!( + ron::from_str::(r#""\xf0\x9f""#), + Err(SpannedError { + code: Error::InvalidEscape("Not a valid byte-escaped Unicode character"), + position: Position { line: 1, col: 10 } + }) + ); + assert_eq!( + ron::from_str::(r#""\xf0\x9f\x40""#), + Err(SpannedError { + code: Error::InvalidEscape("Not a valid byte-escaped Unicode character"), + position: Position { line: 1, col: 14 } + }) + ); + assert_eq!( + ron::from_str::(r#""\xf0\x9f\xa6""#), + Err(SpannedError { + code: Error::InvalidEscape("Not a valid byte-escaped Unicode character"), + position: Position { line: 1, col: 14 } + }) + ); + + assert_eq!(ron::from_str::(r"'\u{1F980}'"), Ok('\u{1F980}')); + assert_eq!( + ron::from_str::(r"'\xf0\x9f\xa6\x80'"), + Err(SpannedError { + code: Error::InvalidEscape("Not a valid byte-escaped Unicode character"), + position: Position { line: 1, col: 6 } + }) + ); +}