Skip to content

Commit

Permalink
Change API for providing custom entities
Browse files Browse the repository at this point in the history
Rather than passing the unescaping functions an entity mapping
as a data structure, pass a closure that maps each entity
to its replacement text.
  • Loading branch information
dralley committed Jul 10, 2022
1 parent 57cd104 commit 657a8f6
Show file tree
Hide file tree
Showing 6 changed files with 76 additions and 124 deletions.
3 changes: 3 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@
|`read_event_unbuffered` |`read_event`
|`read_to_end_unbuffered` |`read_to_end`
- [#412]: Change `read_to_end*` and `read_text_into` to accept `QName` instead of `AsRef<[u8]>`
- [#415]: Changed custom entity unescaping API to accept closures rather than a mapping of entity to
replacement text. This avoids needing to allocate a map and provides the user with more flexibility.

### New Tests

Expand All @@ -131,6 +133,7 @@
[#403]: https://github.com/tafia/quick-xml/pull/403
[#407]: https://github.com/tafia/quick-xml/pull/407
[#412]: https://github.com/tafia/quick-xml/pull/412
[#415]: https://github.com/tafia/quick-xml/pull/415

## 0.23.0 -- 2022-05-08

Expand Down
37 changes: 21 additions & 16 deletions examples/custom_entities.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
//! * the regex in this example is simple but brittle;
//! * it does not support the use of entities in entity declaration.

use std::collections::HashMap;

use quick_xml::events::Event;
use quick_xml::Reader;
use regex::bytes::Regex;
use std::collections::HashMap;

const DATA: &str = r#"
Expand All @@ -27,35 +28,39 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
reader.trim_text(true);

let mut buf = Vec::new();
let mut custom_entities = HashMap::new();
let mut custom_entities: HashMap<Vec<u8>, String> = HashMap::new();
let entity_re = Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#)?;

loop {
match reader.read_event_into(&mut buf) {
Ok(Event::DocType(ref e)) => {
for cap in entity_re.captures_iter(&e) {
custom_entities.insert(cap[1].to_vec(), cap[2].to_vec());
custom_entities.insert(cap[1].to_vec(), String::from_utf8(cap[2].to_vec())?);
}
}
Ok(Event::Start(ref e)) => match e.name().as_ref() {
b"test" => println!(
"attributes values: {:?}",
e.attributes()
.map(|a| a
.unwrap()
.unescape_and_decode_value_with_custom_entities(
&reader,
&custom_entities
)
.unwrap())
.collect::<Vec<_>>()
),
b"test" => {
let lookup_custom_entity = |ent| custom_entities.get(ent).map(|s| s.as_str());
let attributes = e
.attributes()
.map(|a| {
a.unwrap()
.unescape_and_decode_value_with_custom_entities(
&reader,
lookup_custom_entity,
)
.unwrap()
})
.collect::<Vec<_>>();
println!("attributes values: {:?}", attributes);
}
_ => (),
},
Ok(Event::Text(ref e)) => {
let lookup_custom_entity = |ent| custom_entities.get(ent).map(|s| s.as_str());
println!(
"text value: {}",
e.unescape_and_decode_with_custom_entities(&reader, &custom_entities)
e.unescape_and_decode_with_custom_entities(&reader, lookup_custom_entity)
.unwrap()
);
}
Expand Down
83 changes: 30 additions & 53 deletions src/escapei.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

use memchr;
use std::borrow::Cow;
use std::collections::HashMap;
use std::ops::Range;

#[cfg(test)]
Expand Down Expand Up @@ -66,31 +65,15 @@ impl std::error::Error for EscapeError {}
/// Escapes a `&[u8]` and replaces all xml special characters (<, >, &, ', ") with their
/// corresponding xml escaped value.
pub fn escape(raw: &[u8]) -> Cow<[u8]> {
#[inline]
fn to_escape(b: u8) -> bool {
match b {
b'<' | b'>' | b'\'' | b'&' | b'"' => true,
_ => false,
}
}

_escape(raw, to_escape)
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&' | b'\'' | b'\"'))
}

/// Should only be used for escaping text content. In xml text content, it is allowed
/// (though not recommended) to leave the quote special characters " and ' unescaped.
/// This function escapes a `&[u8]` and replaces xml special characters (<, >, &) with
/// their corresponding xml escaped value, but does not escape quote characters.
pub fn partial_escape(raw: &[u8]) -> Cow<[u8]> {
#[inline]
fn to_escape(b: u8) -> bool {
match b {
b'<' | b'>' | b'&' => true,
_ => false,
}
}

_escape(raw, to_escape)
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&'))
}

/// Escapes a `&[u8]` and replaces a subset of xml special characters (<, >, &, ', ") with their
Expand Down Expand Up @@ -130,32 +113,22 @@ fn _escape<F: Fn(u8) -> bool>(raw: &[u8], escape_chars: F) -> Cow<[u8]> {
/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
/// value
pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
do_unescape(raw, None)
unescape_with(raw, |_| None)
}

/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
/// value, using a dictionnary of custom entities.
/// value, using a lookup function to resolve custom entities.
///
/// # Pre-condition
///
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
pub fn unescape_with<'a>(
/// The implementation of `lookup_custom_entity` is expected to operate over UTF-8 inputs.
pub fn unescape_with<'a, 'b>(
raw: &'a [u8],
custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
) -> Result<Cow<'a, [u8]>, EscapeError> {
do_unescape(raw, Some(custom_entities))
}

/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
/// value, using an optional dictionary of custom entities.
///
/// # Pre-condition
///
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
pub fn do_unescape<'a>(
raw: &'a [u8],
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
) -> Result<Cow<'a, [u8]>, EscapeError> {
lookup_custom_entity: impl Fn(&'b [u8]) -> Option<&'b str>,
) -> Result<Cow<'a, [u8]>, EscapeError>
where
'a: 'b,
{
let mut unescaped = None;
let mut last_end = 0;
let mut iter = memchr::memchr2_iter(b'&', b';', raw);
Expand All @@ -171,12 +144,14 @@ pub fn do_unescape<'a>(

// search for character correctness
let pat = &raw[start + 1..end];
if let Some(s) = named_entity(pat) {
unescaped.extend_from_slice(s.as_bytes());
} else if pat.starts_with(b"#") {
push_utf8(unescaped, parse_number(&pat[1..], start..end)?);
} else if let Some(value) = custom_entities.and_then(|hm| hm.get(pat)) {
unescaped.extend_from_slice(&value);
if pat.starts_with(b"#") {
let entity = &pat[1..]; // starts after the #
let codepoint = parse_number(entity, start..end)?;
push_utf8(unescaped, codepoint);
} else if let Some(value) = named_entity(pat) {
unescaped.extend_from_slice(value.as_bytes());
} else if let Some(value) = lookup_custom_entity(pat) {
unescaped.extend_from_slice(value.as_bytes());
} else {
return Err(EscapeError::UnrecognizedSymbol(
start + 1..end,
Expand Down Expand Up @@ -1740,18 +1715,20 @@ fn test_unescape() {

#[test]
fn test_unescape_with() {
let custom_entities = vec![(b"foo".to_vec(), b"BAR".to_vec())]
.into_iter()
.collect();
assert_eq!(&*unescape_with(b"test", &custom_entities).unwrap(), b"test");
let custom_entities = |ent: &[u8]| match ent {
b"foo" => Some("BAR"),
_ => None,
};

assert_eq!(&*unescape_with(b"test", custom_entities).unwrap(), b"test");
assert_eq!(
&*unescape_with(b"&lt;test&gt;", &custom_entities).unwrap(),
&*unescape_with(b"&lt;test&gt;", custom_entities).unwrap(),
b"<test>"
);
assert_eq!(&*unescape_with(b"&#x30;", &custom_entities).unwrap(), b"0");
assert_eq!(&*unescape_with(b"&#48;", &custom_entities).unwrap(), b"0");
assert_eq!(&*unescape_with(b"&foo;", &custom_entities).unwrap(), b"BAR");
assert!(unescape_with(b"&fop;", &custom_entities).is_err());
assert_eq!(&*unescape_with(b"&#x30;", custom_entities).unwrap(), b"0");
assert_eq!(&*unescape_with(b"&#48;", custom_entities).unwrap(), b"0");
assert_eq!(&*unescape_with(b"&foo;", custom_entities).unwrap(), b"BAR");
assert!(unescape_with(b"&fop;", custom_entities).is_err());
}

#[test]
Expand Down
39 changes: 11 additions & 28 deletions src/events/attributes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@
//! Provides an iterator over attributes key/value pairs

use crate::errors::{Error, Result as XmlResult};
use crate::escape::{do_unescape, escape};
use crate::escape::{unescape_with, escape};
use crate::name::QName;
use crate::reader::{is_whitespace, Reader};
use crate::utils::{write_byte_string, write_cow_string, Bytes};
use std::fmt::{self, Debug, Display, Formatter};
use std::iter::FusedIterator;
use std::{borrow::Cow, collections::HashMap, ops::Range};
use std::{borrow::Cow, ops::Range};

/// A struct representing a key/value XML attribute.
///
Expand Down Expand Up @@ -41,7 +41,7 @@ impl<'a> Attribute<'a> {
///
/// See also [`unescaped_value_with_custom_entities()`](#method.unescaped_value_with_custom_entities)
pub fn unescaped_value(&self) -> XmlResult<Cow<[u8]>> {
self.make_unescaped_value(None)
self.unescaped_value_with_custom_entities(|_| None)
}

/// Returns the unescaped value, using custom entities.
Expand All @@ -57,18 +57,11 @@ impl<'a> Attribute<'a> {
/// # Pre-condition
///
/// The implementation of `lookup_custom_entity` is expected to operate over UTF-8 inputs.
pub fn unescaped_value_with_custom_entities(
&self,
custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
) -> XmlResult<Cow<[u8]>> {
self.make_unescaped_value(Some(custom_entities))
}

fn make_unescaped_value(
&self,
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
) -> XmlResult<Cow<[u8]>> {
do_unescape(&*self.value, custom_entities).map_err(Error::EscapeError)
pub fn unescaped_value_with_custom_entities<'s>(
&'s self,
lookup_custom_entity: impl Fn(&[u8]) -> Option<&str>,
) -> XmlResult<Cow<'s, [u8]>> {
unescape_with(&*self.value, lookup_custom_entity).map_err(Error::EscapeError)
}

/// Decode then unescapes the value
Expand All @@ -82,7 +75,7 @@ impl<'a> Attribute<'a> {
/// [`unescaped_value()`]: #method.unescaped_value
/// [`Reader::decode()`]: ../../reader/struct.Reader.html#method.decode
pub fn unescape_and_decode_value<B>(&self, reader: &Reader<B>) -> XmlResult<String> {
self.do_unescape_and_decode_value(reader, None)
self.unescape_and_decode_value_with_custom_entities(reader, |_| None)
}

/// Decode then unescapes the value with custom entities
Expand All @@ -102,20 +95,10 @@ impl<'a> Attribute<'a> {
pub fn unescape_and_decode_value_with_custom_entities<B>(
&self,
reader: &Reader<B>,
custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
) -> XmlResult<String> {
self.do_unescape_and_decode_value(reader, Some(custom_entities))
}

/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
fn do_unescape_and_decode_value<B>(
&self,
reader: &Reader<B>,
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
lookup_custom_entity: impl Fn(&[u8]) -> Option<&str>,
) -> XmlResult<String> {
let decoded = reader.decoder().decode(&*self.value)?;

let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?;
let unescaped = unescape_with(decoded.as_bytes(), lookup_custom_entity)?;
Ok(String::from_utf8(unescaped.into_owned())?)
}
}
Expand Down
36 changes: 10 additions & 26 deletions src/events/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,12 @@ pub mod attributes;
#[cfg(feature = "encoding_rs")]
use encoding_rs::Encoding;
use std::borrow::Cow;
use std::collections::HashMap;
use std::fmt::{self, Debug, Formatter};
use std::ops::Deref;
use std::str::from_utf8;

use crate::errors::{Error, Result};
use crate::escape::{do_unescape, escape, partial_escape};
use crate::escape::{escape, partial_escape, unescape_with};
use crate::name::{LocalName, QName};
use crate::reader::{Decoder, Reader};
use crate::utils::write_cow_string;
Expand Down Expand Up @@ -707,7 +706,7 @@ impl<'a> BytesText<'a> {
//TODO: need to think about better API instead of dozens similar functions
// Maybe use builder pattern. After that expose function as public API
//FIXME: need to take into account entities defined in the document
Ok(BytesCData::new(match do_unescape(&self.content, None)? {
Ok(BytesCData::new(match unescape_with(&self.content, |_| None)? {
Cow::Borrowed(_) => self.content,
Cow::Owned(unescaped) => Cow::Owned(unescaped),
}))
Expand All @@ -720,7 +719,7 @@ impl<'a> BytesText<'a> {
///
/// See also [`unescaped_with_custom_entities()`](#method.unescaped_with_custom_entities)
pub fn unescaped(&self) -> Result<Cow<[u8]>> {
self.make_unescaped(None)
self.unescaped_with_custom_entities(|_| None)
}

/// gets unescaped content with custom entities
Expand All @@ -731,21 +730,14 @@ impl<'a> BytesText<'a> {
///
/// # Pre-condition
///
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
/// The implementation of `lookup_custom_entity` is expected to operate over UTF-8 inputs.
///
/// See also [`unescaped()`](#method.unescaped)
pub fn unescaped_with_custom_entities<'s>(
&'s self,
custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
lookup_custom_entities: impl Fn(&[u8]) -> Option<&str>,
) -> Result<Cow<'s, [u8]>> {
self.make_unescaped(Some(custom_entities))
}

fn make_unescaped<'s>(
&'s self,
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
) -> Result<Cow<'s, [u8]>> {
do_unescape(self, custom_entities).map_err(Error::EscapeError)
unescape_with(self, lookup_custom_entities).map_err(Error::EscapeError)
}

/// helper method to unescape then decode self using the reader encoding
Expand All @@ -755,7 +747,7 @@ impl<'a> BytesText<'a> {
/// 1. BytesText::unescaped()
/// 2. Reader::decode(...)
pub fn unescape_and_decode<B>(&self, reader: &Reader<B>) -> Result<String> {
self.do_unescape_and_decode_with_custom_entities(reader, None)
self.unescape_and_decode_with_custom_entities(reader, |_| None)
}

/// helper method to unescape then decode self using the reader encoding with custom entities
Expand All @@ -767,23 +759,15 @@ impl<'a> BytesText<'a> {
///
/// # Pre-condition
///
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
/// The implementation of `lookup_custom_entity` is expected to operate over UTF-8 inputs.
pub fn unescape_and_decode_with_custom_entities<B>(
&self,
reader: &Reader<B>,
custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
) -> Result<String> {
self.do_unescape_and_decode_with_custom_entities(reader, Some(custom_entities))
}

fn do_unescape_and_decode_with_custom_entities<B>(
&self,
reader: &Reader<B>,
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
lookup_custom_entities: impl Fn(&[u8]) -> Option<&str>,
) -> Result<String> {
let decoded = reader.decoder().decode(&*self)?;

let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?;
let unescaped = unescape_with(decoded.as_bytes(), lookup_custom_entities)?;
Ok(String::from_utf8(unescaped.into_owned())?)
}

Expand Down
Loading

0 comments on commit 657a8f6

Please sign in to comment.