Skip to content

Commit

Permalink
temp
Browse files Browse the repository at this point in the history
  • Loading branch information
dralley committed Jun 23, 2022
1 parent 21687c7 commit 1a138d6
Showing 1 changed file with 109 additions and 94 deletions.
203 changes: 109 additions & 94 deletions src/events/attributes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,88 @@ pub struct Attribute<'a> {
}

impl<'a> Attribute<'a> {
/// Returns the attribute value with its whitespace normalized, following section 3.3.3 of
/// the XML specification, and then passed through `do_unescape`.
///
/// <https://www.w3.org/TR/xml/#AVNormalize>
///
/// * Whitespace-like bytes (`\r`, `\n`, `\t`, `' '`) are trimmed from both ends of the value.
/// * Every interior run of whitespace-like bytes is replaced with a single space byte.
/// * The whitespace-normalized bytes are then handed to `do_unescape`; complete handling of
///   character and entity references is still a TODO (see below).
///
/// The returned `Cow` is always the `Owned` variant.
///
/// # Errors
///
/// Propagates the [`EscapeError`] reported by `do_unescape`.
pub fn normalized_value(&'a self) -> Result<Cow<'a, [u8]>, EscapeError> {
    // TODO: character references, entity references, error handling associated with those

    /// Strips whitespace-like bytes from both ends of `attr`; this cannot fail.
    fn trim_value(attr: &[u8]) -> &[u8] {
        let begin = match attr.iter().position(|c| !is_whitespace(*c)) {
            Some(begin) => begin,
            // The value is empty or consists solely of whitespace-like bytes.
            None => return b"",
        };
        // A non-whitespace byte exists at `begin`, so the reverse search always succeeds;
        // `begin` is only a fallback that keeps the inclusive range in bounds.
        let end = attr
            .iter()
            .rposition(|c| !is_whitespace(*c))
            .unwrap_or(begin);
        &attr[begin..=end]
    }

    let trimmed_attr = trim_value(self.value.as_ref());

    // The output buffer is created lazily, on the first whitespace-like byte found inside
    // the trimmed value. Until then the input can be used as-is.
    let mut normalized: Option<Vec<u8>> = None;
    // `true` while scanning a run of whitespace. It starts as `false` because leading
    // whitespace has already been trimmed away.
    let mut in_whitespace_run = false;

    // Single pass over the trimmed value.
    for (idx, &ch) in trimmed_attr.iter().enumerate() {
        match ch {
            b'\n' | b'\r' | b'\t' | b' ' => {
                // Only the first byte of a whitespace run produces output: one space.
                if !in_whitespace_run {
                    in_whitespace_run = true;
                    normalized
                        .get_or_insert_with(|| trimmed_attr[..idx].to_vec())
                        .push(b' ');
                }
            }
            _ => {
                in_whitespace_run = false;
                // While no buffer exists the output is still identical to the input,
                // so there is nothing to copy.
                if let Some(buf) = normalized.as_mut() {
                    buf.push(ch);
                }
            }
        }
    }

    let normalized = match normalized {
        Some(buf) => Cow::Owned(buf),
        None => Cow::Borrowed(trimmed_attr),
    };
    let escaped = do_unescape(&*normalized, None)?;
    Ok(Cow::Owned(escaped.into_owned()))
}
Expand Down Expand Up @@ -190,90 +269,6 @@ impl<'a> From<Attr<&'a [u8]>> for Attribute<'a> {
}
}

/// Normalizes the whitespace of an attribute value, following section 3.3.3 of the XML
/// specification.
///
/// <https://www.w3.org/TR/xml/#AVNormalize>
///
/// * Whitespace-like bytes (`\r`, `\n`, `\t`, `' '`) are trimmed from both ends of the value.
/// * Every interior run of whitespace-like bytes is replaced with a single space byte.
/// * Character and entity references are left untouched by this function (see the TODO).
///
/// Returns `Cow::Borrowed` (a sub-slice of `attr`) when the trimmed value contains no
/// whitespace-like byte; otherwise returns a newly allocated `Cow::Owned` buffer.
fn normalize_attribute_value(attr: &[u8]) -> Cow<[u8]> {
    // TODO: character references, entity references, error handling associated with those

    /// Strips whitespace-like bytes from both ends of `attr`; this cannot fail.
    fn trim_value(attr: &[u8]) -> &[u8] {
        let begin = match attr.iter().position(|c| !is_whitespace(*c)) {
            Some(begin) => begin,
            // The value is empty or consists solely of whitespace-like bytes.
            None => return b"",
        };
        // A non-whitespace byte exists at `begin`, so the reverse search always succeeds;
        // `begin` is only a fallback that keeps the inclusive range in bounds.
        let end = attr
            .iter()
            .rposition(|c| !is_whitespace(*c))
            .unwrap_or(begin);
        &attr[begin..=end]
    }

    let trimmed_attr = trim_value(attr);

    // The output buffer is created lazily, on the first whitespace-like byte found inside
    // the trimmed value. Until then the input can be returned as-is.
    let mut normalized: Option<Vec<u8>> = None;
    // `true` while scanning a run of whitespace. It starts as `false` because leading
    // whitespace has already been trimmed away.
    let mut in_whitespace_run = false;

    // Single pass over the trimmed value.
    for (idx, &ch) in trimmed_attr.iter().enumerate() {
        match ch {
            b'\n' | b'\r' | b'\t' | b' ' => {
                // Only the first byte of a whitespace run produces output: one space.
                if !in_whitespace_run {
                    in_whitespace_run = true;
                    normalized
                        .get_or_insert_with(|| trimmed_attr[..idx].to_vec())
                        .push(b' ');
                }
            }
            _ => {
                in_whitespace_run = false;
                // While no buffer exists the output is still identical to the input,
                // so there is nothing to copy.
                if let Some(buf) = normalized.as_mut() {
                    buf.push(ch);
                }
            }
        }
    }

    match normalized {
        Some(buf) => Cow::Owned(buf),
        None => Cow::Borrowed(trimmed_attr),
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Iterator over XML attributes.
Expand Down Expand Up @@ -893,36 +888,56 @@ mod xml {
// Exercises attribute-value normalization: trimming of leading/trailing whitespace,
// replacement of \r / \t / \n with a space, collapsing of whitespace runs, and the
// interaction with character references.
//
// NOTE(review): several `assert_eq!` calls below list both a `normalize_attribute_value(..)`
// argument and an `attr.normalized_value().unwrap()` argument ahead of the expected value.
// This looks like the before/after sides of the same change shown together - confirm which
// call is intended and keep only that one, since `assert_eq!` takes two values to compare.
#[test]
fn attribute_value_normalization() {
// empty value
assert_eq!(normalize_attribute_value(b""), Cow::Borrowed(b""));
let attr = Attribute::from(("foo", ""));
assert_eq!(attr.normalized_value().unwrap(), Cow::Borrowed(b""));

// return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character
let attr = Attribute::from(("foo", "\rfoo\rbar\tbaz\ndelta\n"));
assert_eq!(
normalize_attribute_value(b"\rfoo\rbar\tbaz\ndelta\n"),
attr.normalized_value().unwrap(),
Cow::Owned::<[u8]>(b"foo bar baz delta".to_vec())
);

// leading and trailing spaces must be stripped
assert_eq!(normalize_attribute_value(b" foo "), Cow::Borrowed(b"foo"));
let attr = Attribute::from(("foo", " foo "));
assert_eq!(attr.normalized_value().unwrap(), Cow::Borrowed(b"foo"));

// leading space
assert_eq!(normalize_attribute_value(b" bar"), Cow::Borrowed(b"bar"));
let attr = Attribute::from(("foo", " bar"));
assert_eq!(attr.normalized_value().unwrap(), Cow::Borrowed(b"bar"));

// trailing space
assert_eq!(normalize_attribute_value(b"baz "), Cow::Borrowed(b"baz"));
let attr = Attribute::from(("foo", "baz "));
assert_eq!(attr.normalized_value().unwrap(), Cow::Borrowed(b"baz"));

// sequences of spaces must be replaced with a single space
let attr = Attribute::from(("foo", " foo bar baz "));
assert_eq!(
normalize_attribute_value(b" foo bar baz "),
attr.normalized_value().unwrap(),
Cow::Owned::<[u8]>(b"foo bar baz".to_vec())
);

// sequence replacement mixed with characters treated as whitespace (\t \r \n)
let attr = Attribute::from(("foo", " \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r"));
assert_eq!(
normalize_attribute_value(b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r"),
attr.normalized_value().unwrap(),
Cow::Owned::<[u8]>(b"foo bar baz delta echo foxtrot".to_vec())
);

// character references for whitespace-like characters are not combined after substitution
//
// NOTE(review): `&#xD0;`, `&#xA0;` and `&#x90;` are the code points U+00D0, U+00A0 and
// U+0090, not carriage return / line feed / tab. The references for those would be
// `&#xD;`, `&#xA;` and `&#x9;` (compare the 0xD, 0x9, 0xA listed earlier in this test).
// The expected bytes " \r\t\n" also do not follow the order of the references in the
// input - confirm the intended input and expectation.
let attr = Attribute::from(("foo", "&#x20;&#xD0;&#xA0;&#x90;"));
assert_eq!(
normalize_attribute_value(b"&#x20;&#xD0;&#xA0;&#x90;"),
attr.normalized_value().unwrap(),
Cow::Owned::<[u8]>(b" \r\t\n".to_vec())
);

// sequence replacement mixed with character references for whitespace-like characters
//
// NOTE(review): same concern as above - `&#xA0;`, `&#x90;` and `&#xD0;` do not denote
// \n, \t and \r, which is what the expected value assumes.
let attr = Attribute::from((
"foo",
" &#x20;foo\tbar baz &#xA0;delta\n&#x90;\r echo foxtrot&#xD0;",
));
assert_eq!(
normalize_attribute_value(b" &#x20;foo\tbar baz &#xA0;delta\n&#x90;\r echo foxtrot&#xD0;"),
attr.normalized_value().unwrap(),
Cow::Owned::<[u8]>(b" foo bar baz \ndelta \t echo foxtrot\r".to_vec())
);
}
Expand Down

0 comments on commit 1a138d6

Please sign in to comment.