Skip to content

Commit

Permalink
Merge pull request #415 from dralley/escape-api
Browse files Browse the repository at this point in the history
Closure-based unescaping with custom entities
  • Loading branch information
Mingun authored Jul 15, 2022
2 parents 8c622f0 + a4d50a5 commit 72e11d1
Show file tree
Hide file tree
Showing 14 changed files with 142 additions and 259 deletions.
11 changes: 10 additions & 1 deletion Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,15 @@
|`read_event_unbuffered` |`read_event`
|`read_to_end_unbuffered` |`read_to_end`
- [#412]: Change `read_to_end*` and `read_text_into` to accept `QName` instead of `AsRef<[u8]>`

- [#415]: Changed custom entity unescaping API to accept closures rather than a mapping of entity to
replacement text. This avoids needing to allocate a map and provides the user with more flexibility.
- [#415]: Renamed many functions following the pattern `unescape_and_decode*` to `decode_and_unescape*`
to better communicate their function. Renamed functions following the pattern `*_with_custom_entities`
to `decode_and_unescape_with` to be more consistent across the API.
- [#415]: `BytesText::escaped()` renamed to `BytesText::escape()`, `BytesText::unescaped()` renamed to
`BytesText::unescape()`, `BytesText::unescaped_with()` renamed to `BytesText::unescape_with()`,
`Attribute::escaped_value()` renamed to `Attribute::escape_value()`, and `Attribute::escaped_value_with()`
renamed to `Attribute::escape_value_with()` for consistency across the API.
- [#416]: `BytesStart::to_borrowed` renamed to `BytesStart::borrow`, the same method
added to all events

Expand Down Expand Up @@ -137,6 +145,7 @@
[#403]: https://github.com/tafia/quick-xml/pull/403
[#407]: https://github.com/tafia/quick-xml/pull/407
[#412]: https://github.com/tafia/quick-xml/pull/412
[#415]: https://github.com/tafia/quick-xml/pull/415
[#416]: https://github.com/tafia/quick-xml/pull/416
[#418]: https://github.com/tafia/quick-xml/pull/418

Expand Down
4 changes: 2 additions & 2 deletions benches/macrobenches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ fn parse_document(doc: &[u8]) -> XmlResult<()> {
match r.read_event()? {
Event::Start(e) | Event::Empty(e) => {
for attr in e.attributes() {
criterion::black_box(attr?.unescaped_value()?);
criterion::black_box(attr?.unescape_value()?);
}
}
Event::Text(e) => {
criterion::black_box(e.unescaped()?);
criterion::black_box(e.unescape()?);
}
Event::CData(e) => {
criterion::black_box(e.into_inner());
Expand Down
83 changes: 1 addition & 82 deletions benches/microbenches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,86 +125,6 @@ fn read_namespaced_event(c: &mut Criterion) {
group.finish();
}

/// Benchmarks the `BytesText::unescaped()` method (includes time of `read_event`
/// benchmark)
fn bytes_text_unescaped(c: &mut Criterion) {
let mut group = c.benchmark_group("BytesText::unescaped");
group.bench_function("trim_text = false", |b| {
b.iter(|| {
let mut buf = Vec::new();
let mut r = Reader::from_reader(SAMPLE);
r.check_end_names(false).check_comments(false);
let mut count = criterion::black_box(0);
let mut nbtxt = criterion::black_box(0);
loop {
match r.read_event_into(&mut buf) {
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
Ok(Event::Eof) => break,
_ => (),
}
buf.clear();
}
assert_eq!(
count, 1550,
"Overall tag count in ./tests/documents/sample_rss.xml"
);

// Windows has \r\n instead of \n
#[cfg(windows)]
assert_eq!(
nbtxt, 67661,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);

#[cfg(not(windows))]
assert_eq!(
nbtxt, 66277,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);
});
});

group.bench_function("trim_text = true", |b| {
b.iter(|| {
let mut buf = Vec::new();
let mut r = Reader::from_reader(SAMPLE);
r.check_end_names(false)
.check_comments(false)
.trim_text(true);
let mut count = criterion::black_box(0);
let mut nbtxt = criterion::black_box(0);
loop {
match r.read_event_into(&mut buf) {
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
Ok(Event::Eof) => break,
_ => (),
}
buf.clear();
}
assert_eq!(
count, 1550,
"Overall tag count in ./tests/documents/sample_rss.xml"
);

// Windows has \r\n instead of \n
#[cfg(windows)]
assert_eq!(
nbtxt, 50334,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);

#[cfg(not(windows))]
assert_eq!(
nbtxt, 50261,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);
});
});
group.finish();
}

/// Benchmarks, how fast individual event parsed
fn one_event(c: &mut Criterion) {
let mut group = c.benchmark_group("One event");
Expand Down Expand Up @@ -256,7 +176,7 @@ fn one_event(c: &mut Criterion) {
.check_comments(false)
.trim_text(true);
match r.read_event_into(&mut buf) {
Ok(Event::Comment(ref e)) => nbtxt += e.unescaped().unwrap().len(),
Ok(Event::Comment(ref e)) => nbtxt += e.unescape().unwrap().len(),
something_else => panic!("Did not expect {:?}", something_else),
};

Expand Down Expand Up @@ -473,7 +393,6 @@ purus. Consequat id porta nibh venenatis cras sed felis.";
criterion_group!(
benches,
read_event,
bytes_text_unescaped,
read_namespaced_event,
one_event,
attributes,
Expand Down
39 changes: 23 additions & 16 deletions examples/custom_entities.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
//! * the regex in this example is simple but brittle;
//! * it does not support the use of entities in entity declaration.

use std::collections::HashMap;

use quick_xml::events::Event;
use quick_xml::Reader;
use regex::bytes::Regex;
use std::collections::HashMap;

const DATA: &str = r#"
Expand All @@ -27,35 +28,41 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
reader.trim_text(true);

let mut buf = Vec::new();
let mut custom_entities = HashMap::new();
let mut custom_entities: HashMap<Vec<u8>, String> = HashMap::new();
let entity_re = Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#)?;

loop {
match reader.read_event_into(&mut buf) {
Ok(Event::DocType(ref e)) => {
for cap in entity_re.captures_iter(&e) {
custom_entities.insert(cap[1].to_vec(), cap[2].to_vec());
custom_entities.insert(
cap[1].to_vec(),
reader.decoder().decode(&cap[2])?.into_owned(),
);
}
}
Ok(Event::Start(ref e)) => match e.name().as_ref() {
b"test" => println!(
"attributes values: {:?}",
e.attributes()
.map(|a| a
.unwrap()
.unescape_and_decode_value_with_custom_entities(
&reader,
&custom_entities
)
.unwrap())
.collect::<Vec<_>>()
),
b"test" => {
let attributes = e
.attributes()
.map(|a| {
a.unwrap()
.decode_and_unescape_value_with(&reader, |ent| {
custom_entities.get(ent).map(|s| s.as_str())
})
.unwrap()
})
.collect::<Vec<_>>();
println!("attributes values: {:?}", attributes);
}
_ => (),
},
Ok(Event::Text(ref e)) => {
println!(
"text value: {}",
e.unescape_and_decode_with_custom_entities(&reader, &custom_entities)
e.decode_and_unescape_with(&reader, |ent| custom_entities
.get(ent)
.map(|s| s.as_str()))
.unwrap()
);
}
Expand Down
4 changes: 2 additions & 2 deletions src/de/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -618,14 +618,14 @@ where
allow_start: bool,
) -> Result<BytesCData<'de>, DeError> {
match self.next()? {
DeEvent::Text(e) if unescape => e.unescape().map_err(Into::into),
DeEvent::Text(e) if unescape => e.unescape_as_cdata().map_err(Into::into),
DeEvent::Text(e) => Ok(BytesCData::new(e.into_inner())),
DeEvent::CData(e) => Ok(e),
DeEvent::Start(e) if allow_start => {
// allow one nested level
let inner = self.next()?;
let t = match inner {
DeEvent::Text(t) if unescape => t.unescape()?,
DeEvent::Text(t) if unescape => t.unescape_as_cdata()?,
DeEvent::Text(t) => BytesCData::new(t.into_inner()),
DeEvent::CData(t) => t,
DeEvent::Start(s) => {
Expand Down
86 changes: 32 additions & 54 deletions src/escapei.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

use memchr;
use std::borrow::Cow;
use std::collections::HashMap;
use std::ops::Range;

#[cfg(test)]
Expand Down Expand Up @@ -66,31 +65,15 @@ impl std::error::Error for EscapeError {}
/// Escapes a `&[u8]` and replaces all xml special characters (<, >, &, ', ") with their
/// corresponding xml escaped value.
pub fn escape(raw: &[u8]) -> Cow<[u8]> {
#[inline]
fn to_escape(b: u8) -> bool {
match b {
b'<' | b'>' | b'\'' | b'&' | b'"' => true,
_ => false,
}
}

_escape(raw, to_escape)
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&' | b'\'' | b'\"'))
}

/// Should only be used for escaping text content. In xml text content, it is allowed
/// (though not recommended) to leave the quote special characters " and ' unescaped.
/// This function escapes a `&[u8]` and replaces xml special characters (<, >, &) with
/// their corresponding xml escaped value, but does not escape quote characters.
pub fn partial_escape(raw: &[u8]) -> Cow<[u8]> {
#[inline]
fn to_escape(b: u8) -> bool {
match b {
b'<' | b'>' | b'&' => true,
_ => false,
}
}

_escape(raw, to_escape)
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&'))
}

/// Escapes a `&[u8]` and replaces a subset of xml special characters (<, >, &, ', ") with their
Expand Down Expand Up @@ -130,32 +113,23 @@ fn _escape<F: Fn(u8) -> bool>(raw: &[u8], escape_chars: F) -> Cow<[u8]> {
/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
/// value
pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
do_unescape(raw, None)
}

/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
/// value, using a dictionnary of custom entities.
///
/// # Pre-condition
///
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
pub fn unescape_with<'a>(
raw: &'a [u8],
custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
) -> Result<Cow<'a, [u8]>, EscapeError> {
do_unescape(raw, Some(custom_entities))
unescape_with(raw, |_| None)
}

/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
/// value, using an optional dictionary of custom entities.
/// value, using a resolver function for custom entities.
///
/// # Pre-condition
///
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
pub fn do_unescape<'a>(
raw: &'a [u8],
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
) -> Result<Cow<'a, [u8]>, EscapeError> {
/// The implementation of `resolve_entity` is expected to operate over UTF-8 inputs.
pub fn unescape_with<'input, 'entity, F>(
raw: &'input [u8],
resolve_entity: F,
) -> Result<Cow<'input, [u8]>, EscapeError>
where
// the lifetime of the output comes from a capture or is `'static`
F: Fn(&[u8]) -> Option<&'entity str>,
{
let mut unescaped = None;
let mut last_end = 0;
let mut iter = memchr::memchr2_iter(b'&', b';', raw);
Expand All @@ -171,12 +145,14 @@ pub fn do_unescape<'a>(

// search for character correctness
let pat = &raw[start + 1..end];
if let Some(s) = named_entity(pat) {
unescaped.extend_from_slice(s.as_bytes());
} else if pat.starts_with(b"#") {
push_utf8(unescaped, parse_number(&pat[1..], start..end)?);
} else if let Some(value) = custom_entities.and_then(|hm| hm.get(pat)) {
unescaped.extend_from_slice(&value);
if pat.starts_with(b"#") {
let entity = &pat[1..]; // starts after the #
let codepoint = parse_number(entity, start..end)?;
push_utf8(unescaped, codepoint);
} else if let Some(value) = named_entity(pat) {
unescaped.extend_from_slice(value.as_bytes());
} else if let Some(value) = resolve_entity(pat) {
unescaped.extend_from_slice(value.as_bytes());
} else {
return Err(EscapeError::UnrecognizedSymbol(
start + 1..end,
Expand Down Expand Up @@ -1740,18 +1716,20 @@ fn test_unescape() {

#[test]
fn test_unescape_with() {
let custom_entities = vec![(b"foo".to_vec(), b"BAR".to_vec())]
.into_iter()
.collect();
assert_eq!(&*unescape_with(b"test", &custom_entities).unwrap(), b"test");
let custom_entities = |ent: &[u8]| match ent {
b"foo" => Some("BAR"),
_ => None,
};

assert_eq!(&*unescape_with(b"test", custom_entities).unwrap(), b"test");
assert_eq!(
&*unescape_with(b"&lt;test&gt;", &custom_entities).unwrap(),
&*unescape_with(b"&lt;test&gt;", custom_entities).unwrap(),
b"<test>"
);
assert_eq!(&*unescape_with(b"&#x30;", &custom_entities).unwrap(), b"0");
assert_eq!(&*unescape_with(b"&#48;", &custom_entities).unwrap(), b"0");
assert_eq!(&*unescape_with(b"&foo;", &custom_entities).unwrap(), b"BAR");
assert!(unescape_with(b"&fop;", &custom_entities).is_err());
assert_eq!(&*unescape_with(b"&#x30;", custom_entities).unwrap(), b"0");
assert_eq!(&*unescape_with(b"&#48;", custom_entities).unwrap(), b"0");
assert_eq!(&*unescape_with(b"&foo;", custom_entities).unwrap(), b"BAR");
assert!(unescape_with(b"&fop;", custom_entities).is_err());
}

#[test]
Expand Down
Loading

0 comments on commit 72e11d1

Please sign in to comment.