Merge pull request #415 from dralley/escape-api

Closure-based unescaping with custom entities
tafia · Jul 15, 2022 · 72e11d1 · 72e11d1
2 parents 8c622f0 + a4d50a5
commit 72e11d1
Show file tree

Hide file tree

Showing 14 changed files with 142 additions and 259 deletions.
diff --git a/Changelog.md b/Changelog.md
@@ -109,7 +109,15 @@
   |`read_event_unbuffered`  |`read_event`
   |`read_to_end_unbuffered` |`read_to_end`
 - [#412]: Change `read_to_end*` and `read_text_into` to accept `QName` instead of `AsRef<[u8]>`
-
+- [#415]: Changed custom entity unescaping API to accept closures rather than a mapping of entity to
+  replacement text. This avoids needing to allocate a map and provides the user with more flexibility.
+- [#415]: Renamed many functions following the pattern `unescape_and_decode*` to `decode_and_unescape*`
+  to better communicate their function. Renamed functions following the pattern `*_with_custom_entities`
+  to `decode_and_unescape_with` to be more consistent across the API.
+- [#415]: `BytesText::escaped()` renamed to `BytesText::escape()`, `BytesText::unescaped()` renamed to
+  `BytesText::unescape()`, `BytesText::unescaped_with()` renamed to `BytesText::unescape_with()`,
+  `Attribute::escaped_value()` renamed to `Attribute::escape_value()`, and `Attribute::escaped_value_with()`
+  renamed to `Attribute::escape_value_with()` for consistency across the API.
 - [#416]: `BytesStart::to_borrowed` renamed to `BytesStart::borrow`, the same method
   added to all events
 
@@ -137,6 +145,7 @@
 [#403]: https://github.com/tafia/quick-xml/pull/403
 [#407]: https://github.com/tafia/quick-xml/pull/407
 [#412]: https://github.com/tafia/quick-xml/pull/412
+[#415]: https://github.com/tafia/quick-xml/pull/415
 [#416]: https://github.com/tafia/quick-xml/pull/416
 [#418]: https://github.com/tafia/quick-xml/pull/418
 

diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs
@@ -24,11 +24,11 @@ fn parse_document(doc: &[u8]) -> XmlResult<()> {
         match r.read_event()? {
             Event::Start(e) | Event::Empty(e) => {
                 for attr in e.attributes() {
-                    criterion::black_box(attr?.unescaped_value()?);
+                    criterion::black_box(attr?.unescape_value()?);
                 }
             }
             Event::Text(e) => {
-                criterion::black_box(e.unescaped()?);
+                criterion::black_box(e.unescape()?);
             }
             Event::CData(e) => {
                 criterion::black_box(e.into_inner());

diff --git a/benches/microbenches.rs b/benches/microbenches.rs
@@ -125,86 +125,6 @@ fn read_namespaced_event(c: &mut Criterion) {
     group.finish();
 }
 
-/// Benchmarks the `BytesText::unescaped()` method (includes time of `read_event`
-/// benchmark)
-fn bytes_text_unescaped(c: &mut Criterion) {
-    let mut group = c.benchmark_group("BytesText::unescaped");
-    group.bench_function("trim_text = false", |b| {
-        b.iter(|| {
-            let mut buf = Vec::new();
-            let mut r = Reader::from_reader(SAMPLE);
-            r.check_end_names(false).check_comments(false);
-            let mut count = criterion::black_box(0);
-            let mut nbtxt = criterion::black_box(0);
-            loop {
-                match r.read_event_into(&mut buf) {
-                    Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
-                    Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
-                    Ok(Event::Eof) => break,
-                    _ => (),
-                }
-                buf.clear();
-            }
-            assert_eq!(
-                count, 1550,
-                "Overall tag count in ./tests/documents/sample_rss.xml"
-            );
-
-            // Windows has \r\n instead of \n
-            #[cfg(windows)]
-            assert_eq!(
-                nbtxt, 67661,
-                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
-            );
-
-            #[cfg(not(windows))]
-            assert_eq!(
-                nbtxt, 66277,
-                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
-            );
-        });
-    });
-
-    group.bench_function("trim_text = true", |b| {
-        b.iter(|| {
-            let mut buf = Vec::new();
-            let mut r = Reader::from_reader(SAMPLE);
-            r.check_end_names(false)
-                .check_comments(false)
-                .trim_text(true);
-            let mut count = criterion::black_box(0);
-            let mut nbtxt = criterion::black_box(0);
-            loop {
-                match r.read_event_into(&mut buf) {
-                    Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
-                    Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
-                    Ok(Event::Eof) => break,
-                    _ => (),
-                }
-                buf.clear();
-            }
-            assert_eq!(
-                count, 1550,
-                "Overall tag count in ./tests/documents/sample_rss.xml"
-            );
-
-            // Windows has \r\n instead of \n
-            #[cfg(windows)]
-            assert_eq!(
-                nbtxt, 50334,
-                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
-            );
-
-            #[cfg(not(windows))]
-            assert_eq!(
-                nbtxt, 50261,
-                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
-            );
-        });
-    });
-    group.finish();
-}
-
 /// Benchmarks, how fast individual event parsed
 fn one_event(c: &mut Criterion) {
     let mut group = c.benchmark_group("One event");
@@ -256,7 +176,7 @@ fn one_event(c: &mut Criterion) {
                 .check_comments(false)
                 .trim_text(true);
             match r.read_event_into(&mut buf) {
-                Ok(Event::Comment(ref e)) => nbtxt += e.unescaped().unwrap().len(),
+                Ok(Event::Comment(ref e)) => nbtxt += e.unescape().unwrap().len(),
                 something_else => panic!("Did not expect {:?}", something_else),
             };
 
@@ -473,7 +393,6 @@ purus. Consequat id porta nibh venenatis cras sed felis.";
 criterion_group!(
     benches,
     read_event,
-    bytes_text_unescaped,
     read_namespaced_event,
     one_event,
     attributes,

diff --git a/examples/custom_entities.rs b/examples/custom_entities.rs
@@ -7,10 +7,11 @@
 //! * the regex in this example is simple but brittle;
 //! * it does not support the use of entities in entity declaration.
 
+use std::collections::HashMap;
+
 use quick_xml::events::Event;
 use quick_xml::Reader;
 use regex::bytes::Regex;
-use std::collections::HashMap;
 
 const DATA: &str = r#"
 
@@ -27,35 +28,41 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     reader.trim_text(true);
 
     let mut buf = Vec::new();
-    let mut custom_entities = HashMap::new();
+    let mut custom_entities: HashMap<Vec<u8>, String> = HashMap::new();
     let entity_re = Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#)?;
 
     loop {
         match reader.read_event_into(&mut buf) {
             Ok(Event::DocType(ref e)) => {
                 for cap in entity_re.captures_iter(&e) {
-                    custom_entities.insert(cap[1].to_vec(), cap[2].to_vec());
+                    custom_entities.insert(
+                        cap[1].to_vec(),
+                        reader.decoder().decode(&cap[2])?.into_owned(),
+                    );
                 }
             }
             Ok(Event::Start(ref e)) => match e.name().as_ref() {
-                b"test" => println!(
-                    "attributes values: {:?}",
-                    e.attributes()
-                        .map(|a| a
-                            .unwrap()
-                            .unescape_and_decode_value_with_custom_entities(
-                                &reader,
-                                &custom_entities
-                            )
-                            .unwrap())
-                        .collect::<Vec<_>>()
-                ),
+                b"test" => {
+                    let attributes = e
+                        .attributes()
+                        .map(|a| {
+                            a.unwrap()
+                                .decode_and_unescape_value_with(&reader, |ent| {
+                                    custom_entities.get(ent).map(|s| s.as_str())
+                                })
+                                .unwrap()
+                        })
+                        .collect::<Vec<_>>();
+                    println!("attributes values: {:?}", attributes);
+                }
                 _ => (),
             },
             Ok(Event::Text(ref e)) => {
                 println!(
                     "text value: {}",
-                    e.unescape_and_decode_with_custom_entities(&reader, &custom_entities)
+                    e.decode_and_unescape_with(&reader, |ent| custom_entities
+                        .get(ent)
+                        .map(|s| s.as_str()))
                         .unwrap()
                 );
             }

diff --git a/src/de/mod.rs b/src/de/mod.rs
@@ -618,14 +618,14 @@ where
         allow_start: bool,
     ) -> Result<BytesCData<'de>, DeError> {
         match self.next()? {
-            DeEvent::Text(e) if unescape => e.unescape().map_err(Into::into),
+            DeEvent::Text(e) if unescape => e.unescape_as_cdata().map_err(Into::into),
             DeEvent::Text(e) => Ok(BytesCData::new(e.into_inner())),
             DeEvent::CData(e) => Ok(e),
             DeEvent::Start(e) if allow_start => {
                 // allow one nested level
                 let inner = self.next()?;
                 let t = match inner {
-                    DeEvent::Text(t) if unescape => t.unescape()?,
+                    DeEvent::Text(t) if unescape => t.unescape_as_cdata()?,
                     DeEvent::Text(t) => BytesCData::new(t.into_inner()),
                     DeEvent::CData(t) => t,
                     DeEvent::Start(s) => {

diff --git a/src/escapei.rs b/src/escapei.rs
@@ -2,7 +2,6 @@
 
 use memchr;
 use std::borrow::Cow;
-use std::collections::HashMap;
 use std::ops::Range;
 
 #[cfg(test)]
@@ -66,31 +65,15 @@ impl std::error::Error for EscapeError {}
 /// Escapes a `&[u8]` and replaces all xml special characters (<, >, &, ', ") with their
 /// corresponding xml escaped value.
 pub fn escape(raw: &[u8]) -> Cow<[u8]> {
-    #[inline]
-    fn to_escape(b: u8) -> bool {
-        match b {
-            b'<' | b'>' | b'\'' | b'&' | b'"' => true,
-            _ => false,
-        }
-    }
-
-    _escape(raw, to_escape)
+    _escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&' | b'\'' | b'\"'))
 }
 
 /// Should only be used for escaping text content. In xml text content, it is allowed
 /// (though not recommended) to leave the quote special characters " and ' unescaped.
 /// This function escapes a `&[u8]` and replaces xml special characters (<, >, &) with
 /// their corresponding xml escaped value, but does not escape quote characters.
 pub fn partial_escape(raw: &[u8]) -> Cow<[u8]> {
-    #[inline]
-    fn to_escape(b: u8) -> bool {
-        match b {
-            b'<' | b'>' | b'&' => true,
-            _ => false,
-        }
-    }
-
-    _escape(raw, to_escape)
+    _escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&'))
 }
 
 /// Escapes a `&[u8]` and replaces a subset of xml special characters (<, >, &, ', ") with their
@@ -130,32 +113,23 @@ fn _escape<F: Fn(u8) -> bool>(raw: &[u8], escape_chars: F) -> Cow<[u8]> {
 /// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
 /// value
 pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
-    do_unescape(raw, None)
-}
-
-/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
-/// value, using a dictionnary of custom entities.
-///
-/// # Pre-condition
-///
-/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
-pub fn unescape_with<'a>(
-    raw: &'a [u8],
-    custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
-) -> Result<Cow<'a, [u8]>, EscapeError> {
-    do_unescape(raw, Some(custom_entities))
+    unescape_with(raw, |_| None)
 }
 
 /// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
-/// value, using an optional dictionary of custom entities.
+/// value, using a resolver function for custom entities.
 ///
 /// # Pre-condition
 ///
-/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
-pub fn do_unescape<'a>(
-    raw: &'a [u8],
-    custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
-) -> Result<Cow<'a, [u8]>, EscapeError> {
+/// The implementation of `resolve_entity` is expected to operate over UTF-8 inputs.
+pub fn unescape_with<'input, 'entity, F>(
+    raw: &'input [u8],
+    resolve_entity: F,
+) -> Result<Cow<'input, [u8]>, EscapeError>
+where
+    // the lifetime of the output comes from a capture or is `'static`
+    F: Fn(&[u8]) -> Option<&'entity str>,
+{
     let mut unescaped = None;
     let mut last_end = 0;
     let mut iter = memchr::memchr2_iter(b'&', b';', raw);
@@ -171,12 +145,14 @@ pub fn do_unescape<'a>(
 
                 // search for character correctness
                 let pat = &raw[start + 1..end];
-                if let Some(s) = named_entity(pat) {
-                    unescaped.extend_from_slice(s.as_bytes());
-                } else if pat.starts_with(b"#") {
-                    push_utf8(unescaped, parse_number(&pat[1..], start..end)?);
-                } else if let Some(value) = custom_entities.and_then(|hm| hm.get(pat)) {
-                    unescaped.extend_from_slice(&value);
+                if pat.starts_with(b"#") {
+                    let entity = &pat[1..]; // starts after the #
+                    let codepoint = parse_number(entity, start..end)?;
+                    push_utf8(unescaped, codepoint);
+                } else if let Some(value) = named_entity(pat) {
+                    unescaped.extend_from_slice(value.as_bytes());
+                } else if let Some(value) = resolve_entity(pat) {
+                    unescaped.extend_from_slice(value.as_bytes());
                 } else {
                     return Err(EscapeError::UnrecognizedSymbol(
                         start + 1..end,
@@ -1740,18 +1716,20 @@ fn test_unescape() {
 
 #[test]
 fn test_unescape_with() {
-    let custom_entities = vec![(b"foo".to_vec(), b"BAR".to_vec())]
-        .into_iter()
-        .collect();
-    assert_eq!(&*unescape_with(b"test", &custom_entities).unwrap(), b"test");
+    let custom_entities = |ent: &[u8]| match ent {
+        b"foo" => Some("BAR"),
+        _ => None,
+    };
+
+    assert_eq!(&*unescape_with(b"test", custom_entities).unwrap(), b"test");
     assert_eq!(
-        &*unescape_with(b"&lt;test&gt;", &custom_entities).unwrap(),
+        &*unescape_with(b"&lt;test&gt;", custom_entities).unwrap(),
         b"<test>"
     );
-    assert_eq!(&*unescape_with(b"&#x30;", &custom_entities).unwrap(), b"0");
-    assert_eq!(&*unescape_with(b"&#48;", &custom_entities).unwrap(), b"0");
-    assert_eq!(&*unescape_with(b"&foo;", &custom_entities).unwrap(), b"BAR");
-    assert!(unescape_with(b"&fop;", &custom_entities).is_err());
+    assert_eq!(&*unescape_with(b"&#x30;", custom_entities).unwrap(), b"0");
+    assert_eq!(&*unescape_with(b"&#48;", custom_entities).unwrap(), b"0");
+    assert_eq!(&*unescape_with(b"&foo;", custom_entities).unwrap(), b"BAR");
+    assert!(unescape_with(b"&fop;", custom_entities).is_err());
 }
 
 #[test]