Skip to content

Commit

Permalink
add serializer for binary xml text + tweak binary deserializer
Browse files Browse the repository at this point in the history
Added reading the config to determine trimming options.
  • Loading branch information
elrnv committed Jul 21, 2024
1 parent 47e2a69 commit 56246ea
Show file tree
Hide file tree
Showing 11 changed files with 4,118 additions and 1,764 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ serde = { version = ">=1.0.139", optional = true }
tokio = { version = "1.10", optional = true, default-features = false, features = ["io-util"] }
memchr = "2.1"
arbitrary = { version = "1", features = ["derive"], optional = true }
ref-cast = "1"

[dev-dependencies]
criterion = "0.4"
Expand Down
72 changes: 61 additions & 11 deletions src/de/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2005,7 +2005,7 @@ use crate::{
errors::Error,
events::{BytesCData, BytesEnd, BytesStart, BytesText, Event},
name::QName,
reader::Reader,
reader::{Config, Reader},
};
use serde::de::{self, Deserialize, DeserializeOwned, DeserializeSeed, SeqAccess, Visitor};
use std::borrow::Cow;
Expand Down Expand Up @@ -2169,6 +2169,31 @@ struct XmlReader<'i, R: XmlRead<'i>, E: EntityResolver = PredefinedEntityResolve
entity_resolver: E,
}

/// Applies the `trim` function to the string held by `value`.
///
/// Borrowed input stays borrowed: the result is simply the sub-slice returned
/// by `trim`. Owned input stays owned: the original `String` is handed back
/// untouched when `trim` removed nothing, otherwise the trimmed part is copied
/// into a new `String`.
fn trim_cow<'a, F>(value: Cow<'a, str>, trim: F) -> Cow<'a, str>
where
    F: FnOnce(&str) -> &str,
{
    let owned = match value {
        // Nothing to allocate for borrowed data, just narrow the slice.
        Cow::Borrowed(text) => return Cow::Borrowed(trim(text)),
        Cow::Owned(text) => text,
    };

    let kept = trim(&owned);
    if kept.len() == owned.len() {
        // `trim` removed nothing, so reuse the existing allocation.
        Cow::Owned(owned)
    } else {
        Cow::Owned(kept.to_string())
    }
}

/// Removes trailing whitespace from text content in place.
///
/// Whitespace here is whatever [`str::trim_end`] strips, i.e. characters with
/// the Unicode `White_Space` property, not only the XML whitespace bytes.
/// Borrowed content is re-sliced and owned content is truncated, so neither
/// case allocates and the `Cow` variant is preserved.
///
/// Returns `true` if the content is empty after trimming.
fn inplace_trim_end(s: &mut Cow<str>) -> bool {
    match s {
        Cow::Borrowed(text) => {
            // Copy the inner `&str` out first so the trimmed slice keeps the
            // lifetime of the underlying data, not of the `&mut` borrow.
            let borrowed = *text;
            *text = borrowed.trim_end();
        }
        Cow::Owned(text) => {
            // The trimmed prefix always ends on a char boundary, so
            // `truncate` cannot panic here.
            let kept = text.trim_end().len();
            text.truncate(kept);
        }
    }
    s.is_empty()
}

impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
fn new(mut reader: R, entity_resolver: E) -> Self {
// Lookahead by one event immediately, so we do not need to check in the
Expand Down Expand Up @@ -2206,20 +2231,23 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
/// Read all consecutive [`Text`] and [`CData`] events until a non-text event
/// occurs. The content of all events is appended to `result` and returned
/// as [`DeEvent::Text`].
///
/// If the resulting text is empty, this function returns `None` to avoid creating an empty event.
///
/// [`Text`]: PayloadEvent::Text
/// [`CData`]: PayloadEvent::CData
fn drain_text(&mut self, mut result: Cow<'i, str>) -> Result<DeEvent<'i>, DeError> {
fn drain_text(&mut self, mut result: Cow<'i, str>) -> Result<Option<DeEvent<'i>>, DeError> {
loop {
if self.current_event_is_last_text() {
break;
}

match self.next_impl()? {
PayloadEvent::Text(mut e) => {
if self.current_event_is_last_text() {
// FIXME: Actually, we should trim after decoding text, but now we trim before
e.inplace_trim_end();
if self.reader.config().trim_text_end {
e.inplace_trim_end();
}
}
result
.to_mut()
Expand All @@ -2228,10 +2256,12 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
PayloadEvent::CData(e) => result.to_mut().push_str(&e.decode()?),

// SAFETY: current_event_is_last_text checks that event is Text or CData
_ => unreachable!("Only `Text` and `CData` events can come here"),
e => {
unreachable!("Only `Text` and `CData` events can come here: {:?}", &e);
}
}
}
Ok(DeEvent::Text(Text { text: result }))
Ok(Some(DeEvent::Text(Text { text: result })))
}

/// Return an input-borrowing event.
Expand All @@ -2241,17 +2271,24 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
PayloadEvent::Start(e) => Ok(DeEvent::Start(e)),
PayloadEvent::End(e) => Ok(DeEvent::End(e)),
PayloadEvent::Text(mut e) => {
if self.current_event_is_last_text() && e.inplace_trim_end() {
// FIXME: Actually, we should trim after decoding text, but now we trim before
continue;
if self.current_event_is_last_text() {
if self.reader.config().trim_text_end && e.inplace_trim_end() {
continue;
}
}

match e.unescape_with(|entity| self.entity_resolver.resolve(entity)).map(|res| self.drain_text(res)) {
Ok(x) => x,
Ok(Ok(None)) => continue,
Ok(Ok(Some(x))) => Ok(x),
Ok(Err(x)) => Err(x),
// Failed to unescape: treat the content as a binary blob.
Err(_) => Ok(DeEvent::Binary(Binary { text: e.into_inner() })),
}
}
PayloadEvent::CData(e) => self.drain_text(e.decode()?),
PayloadEvent::CData(e) => match self.drain_text(e.decode()?).transpose() {
None => continue,
Some(x) => x,
},
PayloadEvent::DocType(e) => {
self.entity_resolver
.capture(e)
Expand Down Expand Up @@ -2834,6 +2871,8 @@ where
pub fn from_str_with_resolver(source: &'de str, entity_resolver: E) -> Self {
let mut reader = Reader::from_str(source);
let config = reader.config_mut();
config.trim_text_start = true;
config.trim_text_end = true;
config.expand_empty_elements = true;

Self::new(
Expand Down Expand Up @@ -3135,6 +3174,9 @@ pub trait XmlRead<'i> {

/// A copy of the reader's decoder used to decode strings.
fn decoder(&self) -> Decoder;

/// Returns a reference to the reader config.
fn config(&self) -> &Config;
}

/// XML input source that reads from a std::io input stream.
Expand Down Expand Up @@ -3204,6 +3246,10 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader<R> {
// Delegates to the wrapped reader's `decoder()`.
fn decoder(&self) -> Decoder {
self.reader.decoder()
}

// Exposes the configuration of the wrapped reader by delegating to its
// `config()` accessor.
fn config(&self) -> &Config {
    self.reader.config()
}
}

/// XML input source that reads from a slice of bytes and can borrow from it.
Expand Down Expand Up @@ -3269,6 +3315,10 @@ impl<'de> XmlRead<'de> for SliceReader<'de> {
// Delegates to the wrapped reader's `decoder()`.
fn decoder(&self) -> Decoder {
self.reader.decoder()
}

// Exposes the configuration of the wrapped reader by delegating to its
// `config()` accessor.
fn config(&self) -> &Config {
self.reader.config()
}
}

#[cfg(test)]
Expand Down
2 changes: 1 addition & 1 deletion src/se/content.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ macro_rules! write_primitive {
/// with indent, sequence of strings become one big string with additional content
/// and it would be impossible to distinguish between content of the original
/// strings and inserted indent characters.
pub struct ContentSerializer<'w, 'i, W: Write> {
pub struct ContentSerializer<'w, 'i, W> {
pub writer: &'w mut W,
/// Defines which XML characters need to be escaped in text content
pub level: QuoteLevel,
Expand Down
2 changes: 1 addition & 1 deletion src/se/element.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ macro_rules! write_primitive {
/// - other variants are not supported ([`DeError::Unsupported`] is returned);
///
/// Usage of empty tags depends on the [`ContentSerializer::expand_empty_elements`] setting.
pub struct ElementSerializer<'w, 'k, W: Write> {
pub struct ElementSerializer<'w, 'k, W> {
/// The inner serializer that contains the settings and mostly do the actual work
pub ser: ContentSerializer<'w, 'k, W>,
/// Tag name used to wrap serialized types except enum variants which uses the variant name
Expand Down
Loading

0 comments on commit 56246ea

Please sign in to comment.