Skip to content

Commit

Permalink
Fix a bug in UTF-8 decoding.
Browse files Browse the repository at this point in the history
It was possible for an invalid continuation byte to sneak through, which
resulted in incorrect UTF-8 decoding results.

Fixes rust-lang#321
  • Loading branch information
BurntSushi committed Feb 19, 2017
1 parent cffd451 commit 1e3410a
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 2 deletions.
20 changes: 20 additions & 0 deletions src/utf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
return None;
}
let b1 = src[1];
if 0b11_000000 & b1 != TAG_CONT {
return None;
}
let cp = ((b0 & !TAG_TWO) as u32) << 6
| ((b1 & !TAG_CONT) as u32);
match cp {
Expand All @@ -104,6 +107,12 @@ pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
return None;
}
let (b1, b2) = (src[1], src[2]);
if 0b11_000000 & b1 != TAG_CONT {
return None;
}
if 0b11_000000 & b2 != TAG_CONT {
return None;
}
let cp = ((b0 & !TAG_THREE) as u32) << 12
| ((b1 & !TAG_CONT) as u32) << 6
| ((b2 & !TAG_CONT) as u32);
Expand All @@ -118,6 +127,15 @@ pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
return None;
}
let (b1, b2, b3) = (src[1], src[2], src[3]);
if 0b11_000000 & b1 != TAG_CONT {
return None;
}
if 0b11_000000 & b2 != TAG_CONT {
return None;
}
if 0b11_000000 & b3 != TAG_CONT {
return None;
}
let cp = ((b0 & !TAG_FOUR) as u32) << 18
| ((b1 & !TAG_CONT) as u32) << 12
| ((b2 & !TAG_CONT) as u32) << 6
Expand Down Expand Up @@ -236,6 +254,8 @@ mod tests {
assert_eq!(decode_utf8(&[0xFF]), None);
// Surrogate code point (a lone surrogate, U+D801, not a pair)
assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None);
// Invalid continuation byte.
assert_eq!(decode_utf8(&[0xD4, 0xC2]), None);
// Bad lengths
assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes
assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes
Expand Down
3 changes: 1 addition & 2 deletions tests/macros.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@ macro_rules! ismatch {
($name:ident, $re:expr, $text:expr, $ismatch:expr) => {
#[test]
fn $name() {
let text = text!($text);
let re = regex!($re);
assert!($ismatch == re.is_match(text));
assert!($ismatch == re.is_match(text!($text)));
}
};
}
Expand Down
15 changes: 15 additions & 0 deletions tests/test_default_bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,21 @@ macro_rules! regex_set {
include!("macros_bytes.rs");
include!("macros.rs");

// A thin newtype over a borrowed byte slice, so that test inputs can be
// written and matched as raw bytes.
struct R<'a>(&'a [u8]);

impl<'a> R<'a> {
    // Hands back the wrapped slice with its original lifetime `'a`, so the
    // result may outlive this wrapper.
    fn as_bytes(&self) -> &'a [u8] {
        self.0
    }
}

// See: https://github.com/rust-lang/regex/issues/321
//
// These tests are here because they do not have the same behavior in every
// regex engine.
//
// NOTE(review): `mat!` is defined outside this view; its arguments look like
// (test name, pattern, haystack, expected match span or None) — confirm
// against the macro definition.
//
// First haystack: `\xD4` is a two-byte lead byte, but the following `\xC2` is
// itself a lead byte rather than a continuation byte, and `\xC2` is in turn
// followed by the ASCII byte `\x65`. Neither of the first two bytes therefore
// starts a well-formed UTF-8 sequence; the first one that does is `\x65` at
// offset 2, which lines up with the expected (2, 3).
mat!(invalid_utf8_nfa1, r".", R(b"\xD4\xC2\x65\x2B\x0E\xFE"), Some((2, 3)));
mat!(invalid_utf8_nfa2, r"${2}ä", R(b"\xD4\xC2\x65\x2B\x0E\xFE"), None);
// Second haystack: `\x0A` (line feed) is followed by `\xDB\x82`, a
// well-formed two-byte sequence occupying offsets 1..3, which lines up with
// the expected (1, 3). The trailing `\xDD\x33` and `\xCD` are lead bytes
// without a valid continuation byte.
mat!(invalid_utf8_nfa3, r".", R(b"\x0A\xDB\x82\x6E\x33\x01\xDD\x33\xCD"),
Some((1, 3)));
mat!(invalid_utf8_nfa4, r"${2}ä", R(b"\x0A\xDB\x82\x6E\x33\x01\xDD\x33\xCD"),
None);

mod api;
mod bytes;
mod crazy;
Expand Down

0 comments on commit 1e3410a

Please sign in to comment.