diff --git a/src/events/attributes.rs b/src/events/attributes.rs index 7939edac..df47faf4 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -33,9 +33,88 @@ pub struct Attribute<'a> { } impl<'a> Attribute<'a> { + /// Normalize the attribute value according to xml specification section 3.3.3 /// + /// https://www.w3.org/TR/xml/#AVNormalize + /// + /// * Whitespace-like characters (\r, \n, \t, ' ') are trimmed from the ends of the value + /// * Sequences of whitespace-like characters are replaced with a single whitespace character + /// * Character and entity references are substituted as defined by the spec pub fn normalized_value(&'a self) -> Result, EscapeError> { - let normalized = normalize_attribute_value(self.value.as_ref()); + // TODO: character references, entity references, error handling associated with those + + #[derive(PartialEq)] + enum ParseState { + Space, + CDATA, + } + + // Trim characters from the beginning and end of the attribute value - this can't fail. + fn trim_value(attr: &[u8]) -> &[u8] { + let first_non_space_char = attr.iter().position(|c| !is_whitespace(*c)); + + if first_non_space_char.is_none() { + // The entire value was whitespace-like characters + return b""; + } + + let last_non_space_char = attr.iter().rposition(|c| !is_whitespace(*c)); + + // Trim all whitespace-like characters away from the beginning and end of the attribute value. + let begin = first_non_space_char.unwrap(); + let end = last_non_space_char.unwrap_or(attr.len()); + &attr[begin..=end] + } + + let trimmed_attr = trim_value(self.value.as_ref()); + + // A new buffer is only created when we encounter a situation that requires it. + let mut normalized: Option> = None; + // We start on character data because all whitespace-like characters are already trimmed away. + let mut current_state = ParseState::CDATA; + + // Perform a single pass over the trimmed attribute value. 
If we encounter a character / entity reference + // or whitespace-like characters that need to be substituted, copy everything processed thus far to a new + // buffer and continue using this buffer. + for (idx, ch) in trimmed_attr.iter().enumerate() { + match ch { + b'\n' | b'\r' | b'\t' | b' ' => match current_state { + ParseState::Space => match normalized { + Some(_) => continue, + None => normalized = Some(Vec::from(&trimmed_attr[..idx])), + }, + ParseState::CDATA => { + current_state = ParseState::Space; + match normalized.as_mut() { + Some(buf) => buf.push(b' '), + None => { + let mut buf = Vec::from(&trimmed_attr[..idx]); + buf.push(b' '); + normalized = Some(buf); + } + } + } + }, + c @ _ => match current_state { + ParseState::Space => { + current_state = ParseState::CDATA; + if let Some(normalized) = normalized.as_mut() { + normalized.push(*c); + } + } + ParseState::CDATA => { + if let Some(normalized) = normalized.as_mut() { + normalized.push(*c); + } + } + }, + } + } + + let normalized = match normalized { + Some(normalized) => Cow::Owned(normalized), + None => Cow::Borrowed(trimmed_attr), + }; let escaped = do_unescape(&*normalized, None)?; Ok(Cow::Owned(escaped.into_owned())) } @@ -190,90 +269,6 @@ impl<'a> From> for Attribute<'a> { } } -/// Normalize the attribute value according to xml specification section 3.3.3 -/// -/// https://www.w3.org/TR/xml/#AVNormalize -/// -/// * Whitespace-like characters (\r, \n, \t, ' ') are trimmed from the ends of the value -/// * Sequences of whitespace-like characters are replaced with a single whitespace character -/// * Character and entity references are substituted as defined by the spec -fn normalize_attribute_value(attr: &[u8]) -> Cow<[u8]> { - // TODO: character references, entity references, error handling associated with those - - #[derive(PartialEq)] - enum ParseState { - Space, - CDATA, - } - - // Trim characters from the beginning and end of the attribute value - this can't fail. 
- fn trim_value(attr: &[u8]) -> &[u8] { - let first_non_space_char = attr.iter().position(|c| !is_whitespace(*c)); - - if first_non_space_char.is_none() { - // The entire value was whitespace-like characters - return b""; - } - - let last_non_space_char = attr.iter().rposition(|c| !is_whitespace(*c)); - - // Trim all whitespace-like characters away from the beginning and end of the attribute value. - let begin = first_non_space_char.unwrap(); - let end = last_non_space_char.unwrap_or(attr.len()); - &attr[begin..=end] - } - - let trimmed_attr = trim_value(attr); - - // A new buffer is only created when we encounter a situation that requires it. - let mut normalized: Option> = None; - // We start on character data because all whitespace-like characters are already trimmed away. - let mut current_state = ParseState::CDATA; - - // Perform a single pass over the trimmed attribute value. If we encounter a character / entity reference - // or whitespace-like characters that need to be substituted, copy everything processed thus far to a new - // buffer and continue using this buffer. 
- for (idx, ch) in trimmed_attr.iter().enumerate() { - match ch { - b'\n' | b'\r' | b'\t' | b' ' => match current_state { - ParseState::Space => match normalized { - Some(_) => continue, - None => normalized = Some(Vec::from(&trimmed_attr[..idx])), - }, - ParseState::CDATA => { - current_state = ParseState::Space; - match normalized.as_mut() { - Some(buf) => buf.push(b' '), - None => { - let mut buf = Vec::from(&trimmed_attr[..idx]); - buf.push(b' '); - normalized = Some(buf); - } - } - } - }, - c @ _ => match current_state { - ParseState::Space => { - current_state = ParseState::CDATA; - if let Some(normalized) = normalized.as_mut() { - normalized.push(*c); - } - } - ParseState::CDATA => { - if let Some(normalized) = normalized.as_mut() { - normalized.push(*c); - } - } - }, - } - } - - match normalized { - Some(normalized) => Cow::Owned(normalized), - None => Cow::Borrowed(trimmed_attr), - } -} - //////////////////////////////////////////////////////////////////////////////////////////////////// /// Iterator over XML attributes. 
#[test]
fn attribute_value_normalization() {
    // Builds an attribute named "foo" from `raw` and compares the bytes of its normalized
    // value with `expected`. `Cow` equality looks at the contents only, so the expectation
    // is passed as a plain byte slice.
    fn check(raw: &str, expected: &[u8]) {
        let attr = Attribute::from(("foo", raw));
        assert_eq!(attr.normalized_value().unwrap(), Cow::Borrowed(expected));
    }

    // empty value
    check("", b"");

    // return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character
    check("\rfoo\rbar\tbaz\ndelta\n", b"foo bar baz delta");

    // leading and trailing spaces must be stripped
    check(" foo ", b"foo");

    // leading space
    check(" bar", b"bar");

    // trailing space
    check("baz ", b"baz");

    // sequences of spaces must be replaced with a single space
    check(" foo bar baz ", b"foo bar baz");

    // sequence replacement mixed with characters treated as whitespace (\t \r \n)
    check(
        " \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r",
        b"foo bar baz delta echo foxtrot",
    );

    // character references for whitespace-like characters are not combined after substitution
    // NOTE(review): as written, this literal contains no character references - it looks
    // mis-encoded; verify it against the upstream source.
    check(" Р", b" \r\t\n");

    // sequence replacement mixed with characters treated as whitespace (\t \r \n)
    // NOTE(review): same concern as above - the input shows no character references although
    // the expected output contains \n, \t and \r; verify against the upstream source.
    check(
        " foo\tbar baz  delta\n\r echo foxtrotÐ",
        b" foo bar baz \ndelta \t echo foxtrot\r",
    );
}