diff --git a/Changelog.md b/Changelog.md index cafe5fc8..0e88eeb8 100644 --- a/Changelog.md +++ b/Changelog.md @@ -109,7 +109,15 @@ |`read_event_unbuffered` |`read_event` |`read_to_end_unbuffered` |`read_to_end` - [#412]: Change `read_to_end*` and `read_text_into` to accept `QName` instead of `AsRef<[u8]>` - +- [#415]: Changed custom entity unescaping API to accept closures rather than a mapping of entity to + replacement text. This avoids needing to allocate a map and provides the user with more flexibility. +- [#415]: Renamed many functions following the pattern `unescape_and_decode*` to `decode_and_unescape*` + to better communicate their function. Renamed functions following the pattern `*_with_custom_entities` + to `decode_and_unescape_with` to be more consistent across the API. +- [#415]: `BytesText::escaped()` renamed to `BytesText::escape()`, `BytesText::unescaped()` renamed to + `BytesText::unescape()`, `BytesText::unescaped_with_custom_entities()` renamed to `BytesText::unescape_with()`, + `Attribute::unescaped_value()` renamed to `Attribute::unescape_value()`, and `Attribute::unescaped_value_with_custom_entities()` + renamed to `Attribute::unescape_value_with()` for consistency across the API. - [#416]: `BytesStart::to_borrowed` renamed to `BytesStart::borrow`, the same method added to all events @@ -137,6 +145,7 @@ [#403]: https://github.com/tafia/quick-xml/pull/403 [#407]: https://github.com/tafia/quick-xml/pull/407 [#412]: https://github.com/tafia/quick-xml/pull/412 +[#415]: https://github.com/tafia/quick-xml/pull/415 [#416]: https://github.com/tafia/quick-xml/pull/416 [#418]: https://github.com/tafia/quick-xml/pull/418 diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs index e5b4211c..f9d75c92 100644 --- a/benches/macrobenches.rs +++ b/benches/macrobenches.rs @@ -24,11 +24,11 @@ fn parse_document(doc: &[u8]) -> XmlResult<()> { match r.read_event()? 
{ Event::Start(e) | Event::Empty(e) => { for attr in e.attributes() { - criterion::black_box(attr?.unescaped_value()?); + criterion::black_box(attr?.unescape_value()?); } } Event::Text(e) => { - criterion::black_box(e.unescaped()?); + criterion::black_box(e.unescape()?); } Event::CData(e) => { criterion::black_box(e.into_inner()); diff --git a/benches/microbenches.rs b/benches/microbenches.rs index b9713e32..0524f3e0 100644 --- a/benches/microbenches.rs +++ b/benches/microbenches.rs @@ -125,86 +125,6 @@ fn read_namespaced_event(c: &mut Criterion) { group.finish(); } -/// Benchmarks the `BytesText::unescaped()` method (includes time of `read_event` -/// benchmark) -fn bytes_text_unescaped(c: &mut Criterion) { - let mut group = c.benchmark_group("BytesText::unescaped"); - group.bench_function("trim_text = false", |b| { - b.iter(|| { - let mut buf = Vec::new(); - let mut r = Reader::from_reader(SAMPLE); - r.check_end_names(false).check_comments(false); - let mut count = criterion::black_box(0); - let mut nbtxt = criterion::black_box(0); - loop { - match r.read_event_into(&mut buf) { - Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1, - Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(), - Ok(Event::Eof) => break, - _ => (), - } - buf.clear(); - } - assert_eq!( - count, 1550, - "Overall tag count in ./tests/documents/sample_rss.xml" - ); - - // Windows has \r\n instead of \n - #[cfg(windows)] - assert_eq!( - nbtxt, 67661, - "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml" - ); - - #[cfg(not(windows))] - assert_eq!( - nbtxt, 66277, - "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml" - ); - }); - }); - - group.bench_function("trim_text = true", |b| { - b.iter(|| { - let mut buf = Vec::new(); - let mut r = Reader::from_reader(SAMPLE); - r.check_end_names(false) - .check_comments(false) - .trim_text(true); - let mut count = criterion::black_box(0); - let mut nbtxt = 
criterion::black_box(0); - loop { - match r.read_event_into(&mut buf) { - Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1, - Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(), - Ok(Event::Eof) => break, - _ => (), - } - buf.clear(); - } - assert_eq!( - count, 1550, - "Overall tag count in ./tests/documents/sample_rss.xml" - ); - - // Windows has \r\n instead of \n - #[cfg(windows)] - assert_eq!( - nbtxt, 50334, - "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml" - ); - - #[cfg(not(windows))] - assert_eq!( - nbtxt, 50261, - "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml" - ); - }); - }); - group.finish(); -} - /// Benchmarks, how fast individual event parsed fn one_event(c: &mut Criterion) { let mut group = c.benchmark_group("One event"); @@ -256,7 +176,7 @@ fn one_event(c: &mut Criterion) { .check_comments(false) .trim_text(true); match r.read_event_into(&mut buf) { - Ok(Event::Comment(ref e)) => nbtxt += e.unescaped().unwrap().len(), + Ok(Event::Comment(ref e)) => nbtxt += e.unescape().unwrap().len(), something_else => panic!("Did not expect {:?}", something_else), }; @@ -473,7 +393,6 @@ purus. Consequat id porta nibh venenatis cras sed felis."; criterion_group!( benches, read_event, - bytes_text_unescaped, read_namespaced_event, one_event, attributes, diff --git a/examples/custom_entities.rs b/examples/custom_entities.rs index 02165faf..f563ea2b 100644 --- a/examples/custom_entities.rs +++ b/examples/custom_entities.rs @@ -7,10 +7,11 @@ //! * the regex in this example is simple but brittle; //! * it does not support the use of entities in entity declaration. 
+use std::collections::HashMap; + use quick_xml::events::Event; use quick_xml::Reader; use regex::bytes::Regex; -use std::collections::HashMap; const DATA: &str = r#" @@ -27,35 +28,41 @@ fn main() -> Result<(), Box> { reader.trim_text(true); let mut buf = Vec::new(); - let mut custom_entities = HashMap::new(); + let mut custom_entities: HashMap, String> = HashMap::new(); let entity_re = Regex::new(r#""#)?; loop { match reader.read_event_into(&mut buf) { Ok(Event::DocType(ref e)) => { for cap in entity_re.captures_iter(&e) { - custom_entities.insert(cap[1].to_vec(), cap[2].to_vec()); + custom_entities.insert( + cap[1].to_vec(), + reader.decoder().decode(&cap[2])?.into_owned(), + ); } } Ok(Event::Start(ref e)) => match e.name().as_ref() { - b"test" => println!( - "attributes values: {:?}", - e.attributes() - .map(|a| a - .unwrap() - .unescape_and_decode_value_with_custom_entities( - &reader, - &custom_entities - ) - .unwrap()) - .collect::>() - ), + b"test" => { + let attributes = e + .attributes() + .map(|a| { + a.unwrap() + .decode_and_unescape_value_with(&reader, |ent| { + custom_entities.get(ent).map(|s| s.as_str()) + }) + .unwrap() + }) + .collect::>(); + println!("attributes values: {:?}", attributes); + } _ => (), }, Ok(Event::Text(ref e)) => { println!( "text value: {}", - e.unescape_and_decode_with_custom_entities(&reader, &custom_entities) + e.decode_and_unescape_with(&reader, |ent| custom_entities + .get(ent) + .map(|s| s.as_str())) .unwrap() ); } diff --git a/src/de/mod.rs b/src/de/mod.rs index 85949d8a..ae94b260 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -618,14 +618,14 @@ where allow_start: bool, ) -> Result, DeError> { match self.next()? 
{ - DeEvent::Text(e) if unescape => e.unescape().map_err(Into::into), + DeEvent::Text(e) if unescape => e.unescape_as_cdata().map_err(Into::into), DeEvent::Text(e) => Ok(BytesCData::new(e.into_inner())), DeEvent::CData(e) => Ok(e), DeEvent::Start(e) if allow_start => { // allow one nested level let inner = self.next()?; let t = match inner { - DeEvent::Text(t) if unescape => t.unescape()?, + DeEvent::Text(t) if unescape => t.unescape_as_cdata()?, DeEvent::Text(t) => BytesCData::new(t.into_inner()), DeEvent::CData(t) => t, DeEvent::Start(s) => { diff --git a/src/escapei.rs b/src/escapei.rs index 64749c27..6af41e7f 100644 --- a/src/escapei.rs +++ b/src/escapei.rs @@ -2,7 +2,6 @@ use memchr; use std::borrow::Cow; -use std::collections::HashMap; use std::ops::Range; #[cfg(test)] @@ -66,15 +65,7 @@ impl std::error::Error for EscapeError {} /// Escapes a `&[u8]` and replaces all xml special characters (<, >, &, ', ") with their /// corresponding xml escaped value. pub fn escape(raw: &[u8]) -> Cow<[u8]> { - #[inline] - fn to_escape(b: u8) -> bool { - match b { - b'<' | b'>' | b'\'' | b'&' | b'"' => true, - _ => false, - } - } - - _escape(raw, to_escape) + _escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&' | b'\'' | b'\"')) } /// Should only be used for escaping text content. In xml text content, it is allowed @@ -82,15 +73,7 @@ pub fn escape(raw: &[u8]) -> Cow<[u8]> { /// This function escapes a `&[u8]` and replaces xml special characters (<, >, &) with /// their corresponding xml escaped value, but does not escape quote characters. 
pub fn partial_escape(raw: &[u8]) -> Cow<[u8]> { - #[inline] - fn to_escape(b: u8) -> bool { - match b { - b'<' | b'>' | b'&' => true, - _ => false, - } - } - - _escape(raw, to_escape) + _escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&')) } /// Escapes a `&[u8]` and replaces a subset of xml special characters (<, >, &, ', ") with their @@ -130,32 +113,23 @@ fn _escape bool>(raw: &[u8], escape_chars: F) -> Cow<[u8]> { /// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding /// value pub fn unescape(raw: &[u8]) -> Result, EscapeError> { - do_unescape(raw, None) -} - -/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding -/// value, using a dictionnary of custom entities. -/// -/// # Pre-condition -/// -/// The keys and values of `custom_entities`, if any, must be valid UTF-8. -pub fn unescape_with<'a>( - raw: &'a [u8], - custom_entities: &HashMap, Vec>, -) -> Result, EscapeError> { - do_unescape(raw, Some(custom_entities)) + unescape_with(raw, |_| None) } /// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding -/// value, using an optional dictionary of custom entities. +/// value, using a resolver function for custom entities. /// /// # Pre-condition /// -/// The keys and values of `custom_entities`, if any, must be valid UTF-8. -pub fn do_unescape<'a>( - raw: &'a [u8], - custom_entities: Option<&HashMap, Vec>>, -) -> Result, EscapeError> { +/// The implementation of `resolve_entity` is expected to operate over UTF-8 inputs. 
+pub fn unescape_with<'input, 'entity, F>( + raw: &'input [u8], + resolve_entity: F, +) -> Result, EscapeError> +where + // the lifetime of the output comes from a capture or is `'static` + F: Fn(&[u8]) -> Option<&'entity str>, +{ let mut unescaped = None; let mut last_end = 0; let mut iter = memchr::memchr2_iter(b'&', b';', raw); @@ -171,12 +145,14 @@ pub fn do_unescape<'a>( // search for character correctness let pat = &raw[start + 1..end]; - if let Some(s) = named_entity(pat) { - unescaped.extend_from_slice(s.as_bytes()); - } else if pat.starts_with(b"#") { - push_utf8(unescaped, parse_number(&pat[1..], start..end)?); - } else if let Some(value) = custom_entities.and_then(|hm| hm.get(pat)) { - unescaped.extend_from_slice(&value); + if pat.starts_with(b"#") { + let entity = &pat[1..]; // starts after the # + let codepoint = parse_number(entity, start..end)?; + push_utf8(unescaped, codepoint); + } else if let Some(value) = named_entity(pat) { + unescaped.extend_from_slice(value.as_bytes()); + } else if let Some(value) = resolve_entity(pat) { + unescaped.extend_from_slice(value.as_bytes()); } else { return Err(EscapeError::UnrecognizedSymbol( start + 1..end, @@ -1740,18 +1716,20 @@ fn test_unescape() { #[test] fn test_unescape_with() { - let custom_entities = vec![(b"foo".to_vec(), b"BAR".to_vec())] - .into_iter() - .collect(); - assert_eq!(&*unescape_with(b"test", &custom_entities).unwrap(), b"test"); + let custom_entities = |ent: &[u8]| match ent { + b"foo" => Some("BAR"), + _ => None, + }; + + assert_eq!(&*unescape_with(b"test", custom_entities).unwrap(), b"test"); assert_eq!( - &*unescape_with(b"<test>", &custom_entities).unwrap(), + &*unescape_with(b"<test>", custom_entities).unwrap(), b"" ); - assert_eq!(&*unescape_with(b"0", &custom_entities).unwrap(), b"0"); - assert_eq!(&*unescape_with(b"0", &custom_entities).unwrap(), b"0"); - assert_eq!(&*unescape_with(b"&foo;", &custom_entities).unwrap(), b"BAR"); - assert!(unescape_with(b"&fop;", 
&custom_entities).is_err()); + assert_eq!(&*unescape_with(b"0", custom_entities).unwrap(), b"0"); + assert_eq!(&*unescape_with(b"0", custom_entities).unwrap(), b"0"); + assert_eq!(&*unescape_with(b"&foo;", custom_entities).unwrap(), b"BAR"); + assert!(unescape_with(b"&fop;", custom_entities).is_err()); } #[test] diff --git a/src/events/attributes.rs b/src/events/attributes.rs index 4910527e..12715866 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -2,23 +2,23 @@ //! //! Provides an iterator over attributes key/value pairs -use crate::errors::{Error, Result as XmlResult}; -use crate::escape::{do_unescape, escape}; +use crate::errors::Result as XmlResult; +use crate::escape::{escape, unescape_with}; use crate::name::QName; use crate::reader::{is_whitespace, Reader}; use crate::utils::{write_byte_string, write_cow_string, Bytes}; use std::fmt::{self, Debug, Display, Formatter}; use std::iter::FusedIterator; -use std::{borrow::Cow, collections::HashMap, ops::Range}; +use std::{borrow::Cow, ops::Range}; /// A struct representing a key/value XML attribute. /// /// Field `value` stores raw bytes, possibly containing escape-sequences. Most users will likely -/// want to access the value using one of the [`unescaped_value`] and [`unescape_and_decode_value`] +/// want to access the value using one of the [`unescape_value`] and [`decode_and_unescape_value`] /// functions. /// -/// [`unescaped_value`]: Self::unescaped_value -/// [`unescape_and_decode_value`]: Self::unescape_and_decode_value +/// [`unescape_value`]: Self::unescape_value +/// [`decode_and_unescape_value`]: Self::decode_and_unescape_value #[derive(Clone, PartialEq)] pub struct Attribute<'a> { /// The key to uniquely define the attribute. @@ -37,83 +37,67 @@ impl<'a> Attribute<'a> { /// /// This will allocate if the value contains any escape sequences. 
/// - /// See also [`unescaped_value_with_custom_entities()`](Self::unescaped_value_with_custom_entities) - pub fn unescaped_value(&self) -> XmlResult> { - self.make_unescaped_value(None) + /// See also [`unescape_value_with()`](Self::unescape_value_with) + pub fn unescape_value(&self) -> XmlResult> { + self.unescape_value_with(|_| None) } /// Returns the unescaped value, using custom entities. /// /// This is normally the value you are interested in. Escape sequences such as `>` are /// replaced with their unescaped equivalents such as `>`. - /// Additional entities can be provided in `custom_entities`. + /// A fallback resolver for additional custom entities can be provided via + /// `resolve_entity`. /// /// This will allocate if the value contains any escape sequences. /// - /// See also [`unescaped_value()`](Self::unescaped_value) + /// See also [`unescape_value()`](Self::unescape_value) /// /// # Pre-condition /// - /// The keys and values of `custom_entities`, if any, must be valid UTF-8. - pub fn unescaped_value_with_custom_entities( - &self, - custom_entities: &HashMap, Vec>, - ) -> XmlResult> { - self.make_unescaped_value(Some(custom_entities)) + /// The implementation of `resolve_entity` is expected to operate over UTF-8 inputs. + pub fn unescape_value_with<'s, 'entity>( + &'s self, + resolve_entity: impl Fn(&[u8]) -> Option<&'entity str>, + ) -> XmlResult> { + Ok(unescape_with(&*self.value, resolve_entity)?) } - fn make_unescaped_value( - &self, - custom_entities: Option<&HashMap, Vec>>, - ) -> XmlResult> { - do_unescape(&*self.value, custom_entities).map_err(Error::EscapeError) - } - - /// Decode then unescapes the value + /// Decodes then unescapes the value /// /// This allocates a `String` in all cases. For performance reasons it might be a better idea to /// instead use one of: /// /// * [`Reader::decoder().decode()`], as it only allocates when the decoding can't be performed otherwise. 
- /// * [`unescaped_value()`], as it doesn't allocate when no escape sequences are used. + /// * [`unescape_value()`], as it doesn't allocate when no escape sequences are used. /// - /// [`unescaped_value()`]: Self::unescaped_value + /// [`unescape_value()`]: Self::unescape_value /// [`Reader::decoder().decode()`]: crate::reader::Decoder::decode - pub fn unescape_and_decode_value(&self, reader: &Reader) -> XmlResult { - self.do_unescape_and_decode_value(reader, None) + pub fn decode_and_unescape_value(&self, reader: &Reader) -> XmlResult { + self.decode_and_unescape_value_with(reader, |_| None) } - /// Decode then unescapes the value with custom entities + /// Decodes then unescapes the value with custom entities /// /// This allocates a `String` in all cases. For performance reasons it might be a better idea to /// instead use one of: /// /// * [`Reader::decoder().decode()`], as it only allocates when the decoding can't be performed otherwise. - /// * [`unescaped_value_with_custom_entities()`], as it doesn't allocate when no escape sequences are used. + /// * [`unescape_value_with()`], as it doesn't allocate when no escape sequences are used. /// - /// [`unescaped_value_with_custom_entities()`]: Self::unescaped_value_with_custom_entities + /// [`unescape_value_with()`]: Self::unescape_value_with /// [`Reader::decoder().decode()`]: crate::reader::Decoder::decode /// /// # Pre-condition /// - /// The keys and values of `custom_entities`, if any, must be valid UTF-8. - pub fn unescape_and_decode_value_with_custom_entities( - &self, - reader: &Reader, - custom_entities: &HashMap, Vec>, - ) -> XmlResult { - self.do_unescape_and_decode_value(reader, Some(custom_entities)) - } - - /// The keys and values of `custom_entities`, if any, must be valid UTF-8. - fn do_unescape_and_decode_value( + /// The implementation of `resolve_entity` is expected to operate over UTF-8 inputs. 
+ pub fn decode_and_unescape_value_with<'entity, B>( &self, reader: &Reader, - custom_entities: Option<&HashMap, Vec>>, + resolve_entity: impl Fn(&[u8]) -> Option<&'entity str>, ) -> XmlResult { let decoded = reader.decoder().decode(&*self.value)?; - - let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?; + let unescaped = unescape_with(decoded.as_bytes(), resolve_entity)?; Ok(String::from_utf8(unescaped.into_owned())?) } } diff --git a/src/events/mod.rs b/src/events/mod.rs index 24d9f6f1..74540957 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -37,13 +37,12 @@ pub mod attributes; #[cfg(feature = "encoding")] use encoding_rs::Encoding; use std::borrow::Cow; -use std::collections::HashMap; use std::fmt::{self, Debug, Formatter}; use std::ops::Deref; use std::str::from_utf8; use crate::errors::{Error, Result}; -use crate::escape::{do_unescape, escape, partial_escape}; +use crate::escape::{escape, partial_escape, unescape_with}; use crate::name::{LocalName, QName}; use crate::reader::{Decoder, Reader}; use crate::utils::write_cow_string; @@ -736,14 +735,16 @@ impl<'a> BytesText<'a> { /// Returns unescaped version of the text content, that can be written /// as CDATA in XML #[cfg(feature = "serialize")] - pub(crate) fn unescape(self) -> std::result::Result, EscapeError> { + pub(crate) fn unescape_as_cdata(self) -> std::result::Result, EscapeError> { //TODO: need to think about better API instead of dozens similar functions // Maybe use builder pattern. After that expose function as public API //FIXME: need to take into account entities defined in the document - Ok(BytesCData::new(match do_unescape(&self.content, None)? { - Cow::Borrowed(_) => self.content, - Cow::Owned(unescaped) => Cow::Owned(unescaped), - })) + Ok(BytesCData::new( + match unescape_with(&self.content, |_| None)? 
{ + Cow::Borrowed(_) => self.content, + Cow::Owned(unescaped) => Cow::Owned(unescaped), + }, + )) } /// gets escaped content @@ -751,77 +752,62 @@ impl<'a> BytesText<'a> { /// Searches for '&' into content and try to escape the coded character if possible /// returns Malformed error with index within element if '&' is not followed by ';' /// - /// See also [`unescaped_with_custom_entities()`](Self::unescaped_with_custom_entities) - pub fn unescaped(&self) -> Result> { - self.make_unescaped(None) + /// See also [`unescape_with()`](Self::unescape_with) + pub fn unescape(&self) -> Result> { + self.unescape_with(|_| None) } /// gets escaped content with custom entities /// /// Searches for '&' into content and try to escape the coded character if possible /// returns Malformed error with index within element if '&' is not followed by ';' - /// Additional entities can be provided in `custom_entities`. + /// A fallback resolver for additional custom entities can be provided via `resolve_entity`. /// /// # Pre-condition /// - /// The keys and values of `custom_entities`, if any, must be valid UTF-8. + /// The implementation of `resolve_entity` is expected to operate over UTF-8 inputs. /// - /// See also [`unescaped()`](Self::unescaped) - pub fn unescaped_with_custom_entities<'s>( - &'s self, - custom_entities: &HashMap, Vec>, - ) -> Result> { - self.make_unescaped(Some(custom_entities)) - } - - fn make_unescaped<'s>( + /// See also [`unescape()`](Self::unescape) + pub fn unescape_with<'s, 'entity>( &'s self, - custom_entities: Option<&HashMap, Vec>>, + resolve_entity: impl Fn(&[u8]) -> Option<&'entity str>, ) -> Result> { - do_unescape(self, custom_entities).map_err(Error::EscapeError) + Ok(unescape_with(self, resolve_entity)?) } /// helper method to unescape then decode self using the reader encoding /// /// for performance reasons (could avoid allocating a `String`), /// it might be wiser to manually use - /// 1. BytesText::unescaped() - /// 2. Reader::decode(...) 
- pub fn unescape_and_decode(&self, reader: &Reader) -> Result { - self.do_unescape_and_decode_with_custom_entities(reader, None) + /// 1. Reader::decoder().decode(...) + /// 2. BytesText::unescape() + pub fn decode_and_unescape(&self, reader: &Reader) -> Result { + self.decode_and_unescape_with(reader, |_| None) } /// helper method to unescape then decode self using the reader encoding with custom entities /// /// for performance reasons (could avoid allocating a `String`), /// it might be wiser to manually use - /// 1. BytesText::unescaped() - /// 2. Reader::decode(...) + /// 1. Reader::decoder().decode(...) + /// 2. BytesText::unescape() /// /// # Pre-condition /// - /// The keys and values of `custom_entities`, if any, must be valid UTF-8. - pub fn unescape_and_decode_with_custom_entities( - &self, - reader: &Reader, - custom_entities: &HashMap, Vec>, - ) -> Result { - self.do_unescape_and_decode_with_custom_entities(reader, Some(custom_entities)) - } - - fn do_unescape_and_decode_with_custom_entities( + /// The implementation of `resolve_entity` is expected to operate over UTF-8 inputs. + pub fn decode_and_unescape_with<'entity, B>( &self, reader: &Reader, - custom_entities: Option<&HashMap, Vec>>, + resolve_entity: impl Fn(&[u8]) -> Option<&'entity str>, ) -> Result { let decoded = reader.decoder().decode(&*self)?; - let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?; + let unescaped = unescape_with(decoded.as_bytes(), resolve_entity)?; Ok(String::from_utf8(unescaped.into_owned())?) } /// Gets escaped content. - pub fn escaped(&self) -> &[u8] { + pub fn escape(&self) -> &[u8] { self.content.as_ref() } } diff --git a/src/lib.rs b/src/lib.rs index cebc401d..57c6f9cd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -59,7 +59,7 @@ //! } //! }, //! // unescape and decode the text event using the reader encoding -//! Ok(Event::Text(e)) => txt.push(e.unescape_and_decode(&reader).unwrap()), +//! Ok(Event::Text(e)) => txt.push(e.decode_and_unescape(&reader).unwrap()), //! 
Ok(Event::Eof) => break, // exits the loop when reaching end of file //! Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), //! _ => (), // There are several other `Event`s we do not consider here @@ -139,7 +139,7 @@ mod errors; mod escapei; pub mod escape { //! Manage xml character escapes - pub(crate) use crate::escapei::{do_unescape, EscapeError}; + pub(crate) use crate::escapei::EscapeError; pub use crate::escapei::{escape, partial_escape, unescape, unescape_with}; } pub mod events; diff --git a/src/reader.rs b/src/reader.rs index 9a967fba..b2ac9464 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -135,7 +135,7 @@ impl EncodingRef { /// _ => (), /// } /// }, -/// Ok(Event::Text(e)) => txt.push(e.unescape_and_decode(&reader).unwrap()), +/// Ok(Event::Text(e)) => txt.push(e.decode_and_unescape(&reader).unwrap()), /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), /// Ok(Event::Eof) => break, /// _ => (), @@ -496,7 +496,7 @@ impl Reader { /// loop { /// match reader.read_event_into(&mut buf) { /// Ok(Event::Start(ref e)) => count += 1, - /// Ok(Event::Text(e)) => txt.push(e.unescape_and_decode(&reader).expect("Error!")), + /// Ok(Event::Text(e)) => txt.push(e.decode_and_unescape(&reader).expect("Error!")), /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), /// Ok(Event::Eof) => break, /// _ => (), @@ -549,7 +549,7 @@ impl Reader { /// panic!("Undeclared namespace prefix {:?}", String::from_utf8(p)) /// } /// Ok((_, Event::Text(e))) => { - /// txt.push(e.unescape_and_decode(&reader).expect("Error!")) + /// txt.push(e.decode_and_unescape(&reader).expect("Error!")) /// }, /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), /// Ok((_, Event::Eof)) => break, @@ -750,7 +750,7 @@ impl Reader { let s = match self.read_event_into(buf) { Err(e) => return Err(e), - Ok(Event::Text(e)) => e.unescape_and_decode(self), + Ok(Event::Text(e)) => 
e.decode_and_unescape(self), Ok(Event::End(e)) if e.name() == end => return Ok("".to_string()), Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())), _ => return Err(Error::TextNotFound), diff --git a/src/writer.rs b/src/writer.rs index 7c5cf307..ca29954d 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -108,7 +108,7 @@ impl Writer { Event::Empty(ref e) => self.write_wrapped(b"<", e, b"/>"), Event::Text(ref e) => { next_should_line_break = false; - self.write(&e.escaped()) + self.write(&e.escape()) } Event::Comment(ref e) => self.write_wrapped(b""), Event::CData(ref e) => { diff --git a/tests/test.rs b/tests/test.rs index f5a91dc7..162a8bd4 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -104,7 +104,7 @@ fn test_koi8_r_encoding() { loop { match r.read_event_into(&mut buf) { Ok(Text(e)) => { - e.unescape_and_decode(&r).unwrap(); + e.decode_and_unescape(&r).unwrap(); } Ok(Eof) => break, _ => (), @@ -157,13 +157,13 @@ fn fuzz_101() { match reader.read_event_into(&mut buf) { Ok(Start(ref e)) | Ok(Empty(ref e)) => { for a in e.attributes() { - if a.ok().map_or(true, |a| a.unescaped_value().is_err()) { + if a.ok().map_or(true, |a| a.unescape_value().is_err()) { break; } } } Ok(Text(ref e)) => { - if e.unescaped().is_err() { + if e.unescape().is_err() { break; } } diff --git a/tests/unit_tests.rs b/tests/unit_tests.rs index fc4cb98f..6c6a0615 100644 --- a/tests/unit_tests.rs +++ b/tests/unit_tests.rs @@ -523,7 +523,7 @@ fn test_escaped_content() { "content unexpected: expecting '<test>', got '{:?}'", from_utf8(&*e) ); - match e.unescaped() { + match e.unescape() { Ok(ref c) => assert_eq!( &**c, b"", @@ -620,7 +620,7 @@ fn test_read_write_roundtrip_escape() -> Result<()> { match reader.read_event_into(&mut buf)? 
{ Eof => break, Text(e) => { - let t = e.escaped(); + let t = e.escape(); assert!(writer .write_event(Text(BytesText::from_escaped(t.to_vec()))) .is_ok()); @@ -653,7 +653,7 @@ fn test_read_write_roundtrip_escape_text() -> Result<()> { match reader.read_event_into(&mut buf)? { Eof => break, Text(e) => { - let t = e.unescape_and_decode(&reader).unwrap(); + let t = e.decode_and_unescape(&reader).unwrap(); assert!(writer .write_event(Text(BytesText::from_plain_str(&t))) .is_ok()); diff --git a/tests/xmlrs_reader_tests.rs b/tests/xmlrs_reader_tests.rs index 35ce90f3..8eaa66c1 100644 --- a/tests/xmlrs_reader_tests.rs +++ b/tests/xmlrs_reader_tests.rs @@ -459,7 +459,7 @@ fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, decoder: Decoder) -> Ok((_, Event::CData(e))) => format!("CData({})", decoder.decode(&e).unwrap()), Ok((_, Event::Text(e))) => match unescape(decoder.decode(&e).unwrap().as_bytes()) { Ok(c) => format!("Characters({})", from_utf8(c.as_ref()).unwrap()), - Err(err) => format!("FailedUnescape({:?}; {})", e.escaped(), err), + Err(err) => format!("FailedUnescape({:?}; {})", e.escape(), err), }, Ok((_, Event::Decl(e))) => { let version_cow = e.version().unwrap();