From 8741abc5a96217fabd0cbf63d60b6ccb173cc213 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 9 Jun 2024 17:23:45 +0500 Subject: [PATCH 1/4] Move bang type checks outside of loop (Review in whitespace changes ignored mode) --- src/reader/mod.rs | 55 ++++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 63a42574..759ba6aa 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -974,32 +974,35 @@ impl BangType { /// - `chunk`: data read on current iteration and not yet consumed from reader #[inline(always)] fn parse<'b>(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> { - for i in memchr::memchr_iter(b'>', chunk) { - match self { - // Need to read at least 6 symbols (`!---->`) for properly finished comment - // - XML comment - // 012345 - i - Self::Comment if buf.len() + i > 4 => { - if chunk[..i].ends_with(b"--") { - // We cannot strip last `--` from the buffer because we need it in case of - // check_comments enabled option. XML standard requires that comment - // will not end with `--->` sequence because this is a special case of - // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments) - return Some((&chunk[..i], i + 1)); // +1 for `>` - } - // End sequence `-|->` was splitted at | - // buf --/ \-- chunk - if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' { - return Some((&chunk[..i], i + 1)); // +1 for `>` - } - // End sequence `--|>` was splitted at | - // buf --/ \-- chunk - if i == 0 && buf.ends_with(b"--") { - return Some((&[], i + 1)); // +1 for `>` + match self { + Self::Comment => { + for i in memchr::memchr_iter(b'>', chunk) { + // Need to read at least 6 symbols (`!---->`) for properly finished comment + // - XML comment + // 012345 - i + if buf.len() + i > 4 { + if chunk[..i].ends_with(b"--") { + // We cannot strip last `--` from the buffer because we need it in case of + // check_comments enabled option. XML standard requires that comment + // will not end with `--->` sequence because this is a special case of + // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments) + return Some((&chunk[..i], i + 1)); // +1 for `>` + } + // End sequence `-|->` was splitted at | + // buf --/ \-- chunk + if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' { + return Some((&chunk[..i], i + 1)); // +1 for `>` + } + // End sequence `--|>` was splitted at | + // buf --/ \-- chunk + if i == 0 && buf.ends_with(b"--") { + return Some((&[], i + 1)); // +1 for `>` + } } } - Self::Comment => {} - Self::CData => { + } + Self::CData => { + for i in memchr::memchr_iter(b'>', chunk) { if chunk[..i].ends_with(b"]]") { return Some((&chunk[..i], i + 1)); // +1 for `>` } @@ -1014,7 +1017,9 @@ impl BangType { return Some((&[], i + 1)); // +1 for `>` } } - Self::DocType => { + } + Self::DocType => { + for i in memchr::memchr_iter(b'>', chunk) { let content = &chunk[..i]; let balance = memchr::memchr2_iter(b'<', b'>', content) .map(|p| if content[p] == b'<' { 1i32 } else { -1 }) From 89e5360f5a00f4cda36080e03e0dc46f94273ec0 Mon Sep 17 00:00:00 2001 From: Mingun Date: Thu, 4 Jul 2024 22:01:39 +0500 Subject: [PATCH 2/4] Add test for lowercase failures (6): syntax::cdata::lowercase::ns_reader::async_tokio syntax::cdata::lowercase::ns_reader::borrowed syntax::cdata::lowercase::ns_reader::buffered syntax::cdata::lowercase::reader::async_tokio syntax::cdata::lowercase::reader::borrowed syntax::cdata::lowercase::reader::buffered --- tests/reader-errors.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/reader-errors.rs b/tests/reader-errors.rs index 18c08a23..34880c15 100644 --- a/tests/reader-errors.rs +++ b/tests/reader-errors.rs @@ -343,6 +343,8 @@ mod syntax { err!(unclosed24(" SyntaxError::UnclosedCData); err!(unclosed25("") => SyntaxError::UnclosedCData); + err!(lowercase("") => SyntaxError::UnclosedCData); + ok!(normal1("") => 12: Event::CData(BytesCData::new(""))); ok!(normal2("rest") => 12: Event::CData(BytesCData::new(""))); } From 6dec2b22c2cd47f814854e5a06f173e4a21269d7 Mon Sep 17 00:00:00 2001 From: Mingun Date: Thu, 4 Jul 2024 00:40:40 +0500 Subject: [PATCH 3/4] Only uppercase ` { + // XML requires uppercase only: + // https://www.w3.org/TR/xml11/#sec-cdata-sect + // Even HTML5 required uppercase only: + // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state + BangType::CData if buf.starts_with(b"![CDATA[") => { debug_assert!(buf.ends_with(b"]]")); Ok(Event::CData(BytesCData::wrap( // Cut of `![CDATA[` and `]]` from start and end @@ -136,6 +140,10 @@ impl ReaderState { self.decoder(), ))) } + // XML requires uppercase only, but we will check that on validation stage: + // https://www.w3.org/TR/xml11/#sec-prolog-dtd + // HTML5 allows mixed case for doctype declarations: + // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => { match buf[8..].iter().position(|&b| !is_whitespace(b)) { Some(start) => Ok(Event::DocType(BytesText::wrap( From b71cf7c92b40303d883c2d3411b6d9f07288240c Mon Sep 17 00:00:00 2001 From: Mingun Date: Thu, 4 Jul 2024 00:45:06 +0500 Subject: [PATCH 4/4] Use only uppercase DOCTYPE in tests where possible as required by the specification See https://www.w3.org/TR/xml11/#NT-doctypedecl --- tests/fuzzing.rs | 2 +- tests/reader-config.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/fuzzing.rs b/tests/fuzzing.rs index deca382a..2740763c 100644 --- a/tests/fuzzing.rs +++ b/tests/fuzzing.rs @@ -51,7 +51,7 @@ fn fuzz_101() { #[test] fn fuzz_empty_doctype() { - let data: &[u8] = b""; + let data: &[u8] = b""; let mut reader = Reader::from_reader(data); let mut buf = Vec::new(); assert!(matches!( diff --git a/tests/reader-config.rs b/tests/reader-config.rs index bd2465e3..781bca8c 100644 --- a/tests/reader-config.rs +++ b/tests/reader-config.rs @@ -471,7 +471,7 @@ mod trim_markup_names_in_closing_tags { } const XML: &str = " \t\r\n\ - \t\r\n\ + \t\r\n\ \t\r\n\ \t\r\n\ text \t\r\n\