/// Normalize an attribute value following section 3.3.3 of the XML specification.
///
/// <https://www.w3.org/TR/xml/#AVNormalize>
///
/// * Whitespace-like bytes (`\r`, `\n`, `\t`, `' '`) are trimmed from both ends of the value.
/// * Every interior run of whitespace-like bytes is replaced with a single space (`0x20`).
/// * Character and entity references are **not** substituted yet (see the TODO in the body).
///
/// The input is borrowed whenever trimming alone yields the normalized value (including
/// values whose only interior whitespace is isolated single spaces); a new buffer is
/// allocated only when a byte actually has to be replaced or dropped. A value that is
/// empty or consists solely of whitespace-like bytes normalizes to the empty slice.
///
/// NOTE(review): the spec prescribes trimming and collapsing only for attribute types
/// other than CDATA; this function applies them unconditionally — confirm that callers
/// expect this.
fn normalize_attribute_value(attr: &[u8]) -> Cow<[u8]> {
    // TODO: character references, entity references, error handling associated with those

    fn is_whitespace_like(byte: u8) -> bool {
        matches!(byte, b'\n' | b'\r' | b'\t' | b' ')
    }

    // Locate the first byte that survives trimming; without one the result is empty.
    let begin = match attr.iter().position(|&b| !is_whitespace_like(b)) {
        Some(begin) => begin,
        None => return Cow::Borrowed(b""),
    };
    // A non-whitespace byte exists at `begin`, so the reverse search always succeeds.
    // The fallback keeps the inclusive range below in bounds regardless.
    let end = attr
        .iter()
        .rposition(|&b| !is_whitespace_like(b))
        .unwrap_or(begin);
    let trimmed = &attr[begin..=end];

    // Allocated lazily, the first time the output has to differ from `trimmed`.
    let mut normalized: Option<Vec<u8>> = None;
    // True while the previous byte belonged to a whitespace run. `trimmed` starts
    // (and ends) with a non-whitespace byte, so we begin outside of a run.
    let mut in_whitespace = false;

    for (idx, &byte) in trimmed.iter().enumerate() {
        if !is_whitespace_like(byte) {
            in_whitespace = false;
            if let Some(buf) = normalized.as_mut() {
                buf.push(byte);
            }
        } else if in_whitespace {
            // Second or later byte of a run: it is dropped from the output. If no buffer
            // exists yet, the run was opened by a literal space, so `trimmed[..idx]`
            // already ends with the single space that represents this run.
            if normalized.is_none() {
                let mut buf = Vec::with_capacity(trimmed.len());
                buf.extend_from_slice(&trimmed[..idx]);
                normalized = Some(buf);
            }
        } else {
            // First byte of a run: it becomes exactly one space in the output.
            in_whitespace = true;
            if let Some(buf) = normalized.as_mut() {
                buf.push(b' ');
            } else if byte != b' ' {
                // `\r`, `\n` or `\t` must be rewritten, which requires an owned copy.
                let mut buf = Vec::with_capacity(trimmed.len());
                buf.extend_from_slice(&trimmed[..idx]);
                buf.push(b' ');
                normalized = Some(buf);
            }
            // Otherwise: a lone literal space so far; input and output still agree.
        }
    }

    match normalized {
        Some(buf) => Cow::Owned(buf),
        None => Cow::Borrowed(trimmed),
    }
}
/// Table-driven check of `normalize_attribute_value`: each entry pairs a raw
/// attribute value with the normalized bytes it must produce.
#[test]
fn attribute_value_normalization() {
    let cases: [(&[u8], &[u8]); 7] = [
        // empty value
        (b"", b""),
        // return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character
        (b"\rfoo\rbar\tbaz\ndelta\n", b"foo bar baz delta"),
        // leading and trailing spaces must be stripped
        (b" foo ", b"foo"),
        // leading space
        (b" bar", b"bar"),
        // trailing space
        (b"baz ", b"baz"),
        // sequences of spaces must be replaced with a single space
        (b" foo bar baz ", b"foo bar baz"),
        // sequence replacement mixed with characters treated as whitespace (\t \r \n)
        (
            b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r",
            b"foo bar baz delta echo foxtrot",
        ),
    ];

    for &(raw, expected) in cases.iter() {
        let actual = normalize_attribute_value(raw);
        assert_eq!(&*actual, expected, "input: {:?}", raw);
    }
}