Skip to content

Commit

Permalink
Fix #1804: remove <!doctype html> and accept invalid HTML
Browse files Browse the repository at this point in the history
This fixes #1804 in two ways: First, it removes a <!doctype html> from
the start of the mail, if there is any.

Then, it parses the HTML itself if quick-xml fails, just stripping
everything between < and >.

Both of these would have fixed this specific issue.

Also, add tests for both fixes.
  • Loading branch information
Hocuri authored and hpk42 committed Aug 19, 2020
1 parent f1ec1a0 commit 1a736ca
Show file tree
Hide file tree
Showing 3 changed files with 185 additions and 1 deletion.
22 changes: 22 additions & 0 deletions src/dc_receive_imf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2623,4 +2623,26 @@ mod tests {
);
assert_eq!(last_msg.from_id, DC_CONTACT_ID_INFO);
}

/// Regression test for #1804: receives the `wrong-html.eml` fixture (an
/// HTML part that starts with `<!doctype html>`) and checks the plain text
/// stored for the message found at chatlist index 0.
#[async_std::test]
async fn test_html_only_mail() {
    let alice = TestContext::new_alice().await;
    let ctx = &alice.ctx;

    // NOTE(review): "2" presumably makes all incoming mails visible so the
    // fixture is not filtered out -- confirm against `Config::ShowEmails`.
    ctx.set_config(Config::ShowEmails, Some("2")).await.unwrap();

    let raw_mail = include_bytes!("../test-data/message/wrong-html.eml");
    dc_receive_imf(ctx, raw_mail, "INBOX", 0, false)
        .await
        .unwrap();

    let chatlist = Chatlist::try_load(ctx, 0, None, None).await.unwrap();
    let newest_msg_id = chatlist.get_msg_id(0).unwrap();
    let received = Message::load_from_db(ctx, newest_msg_id).await.unwrap();

    let expected =
        " Guten Abend, \n\n Lots of text \n\n text with Umlaut ä... \n\n MfG [...]";
    assert_eq!(received.text.unwrap(), expected);
}
}
50 changes: 49 additions & 1 deletion src/dehtml.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,19 @@ enum AddText {
// dehtml() returns way too many newlines; however, an optimisation on this issue is not needed as
// the newlines are typically removed in further processing by the caller
pub fn dehtml(buf: &str) -> String {
let buf = buf.trim();
let s = dehtml_quick_xml(buf);
if !s.trim().is_empty() {
return s;
}
let s = dehtml_manually(buf);
if !s.trim().is_empty() {
return s;
}
buf.to_string()
}

pub fn dehtml_quick_xml(buf: &str) -> String {
let buf = buf.trim().trim_start_matches("<!doctype html>");

let mut dehtml = Dehtml {
strbuilder: String::with_capacity(buf.len()),
Expand Down Expand Up @@ -171,6 +183,24 @@ fn dehtml_starttag_cb<B: std::io::BufRead>(
}
}

/// Crude HTML-to-text fallback that does no real parsing.
///
/// Every `<` starts a hidden region and every `>` ends it; the two
/// delimiter characters themselves are never emitted, even when they occur
/// outside of a tag. All other characters are copied to the result only
/// while no hidden region is open. An unterminated `<` therefore hides the
/// rest of the input.
pub fn dehtml_manually(buf: &str) -> String {
    let mut inside_tag = false;
    buf.chars()
        .filter(|&ch| match ch {
            '<' => {
                inside_tag = true;
                false
            }
            '>' => {
                inside_tag = false;
                false
            }
            _ => !inside_tag,
        })
        .collect()
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down Expand Up @@ -254,4 +284,22 @@ mod tests {
let txt = dehtml(input);
assert_eq!(txt.trim(), "lots of text");
}

/// Checks `dehtml` followed by `simplify` on inputs that start with a
/// `<!...>` declaration or consist of nothing but a bracketed span.
#[test]
fn test_doctype_html() {
    use crate::simplify::simplify;

    let cases = [
        // A leading doctype must not prevent the markup from being converted.
        ("<!doctype html>\n<b>fat text</b>", "*fat text*"),
        // at least DC should show the text if the html is invalid
        ("<!some invalid html code>\n<b>some text</b>", "some text"),
        // Nothing outside the brackets: the input is expected back verbatim.
        ("<This text is in brackets>", "<This text is in brackets>"),
    ];

    for (html, expected) in cases.iter() {
        let plain = simplify(dehtml(html), false).0;
        assert_eq!(plain.trim(), *expected);
    }
}
}
114 changes: 114 additions & 0 deletions test-data/message/wrong-html.eml
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
Return-Path: <[email protected]>
X-Original-To: [email protected]
Delivered-To: [email protected]
Received: from mout.kundenserver.de (mout.kundenserver.de [212.227.126.131])
by dd37930.kasserver.com (Postfix) with ESMTPS id 271F34B4258C
for <[email protected]>; Thu, 6 Aug 2020 18:40:32 +0200 (CEST)
Received: from oxbsltgw18.schlund.de ([172.19.249.35]) by
mrelayeu.kundenserver.de (mreue009 [213.165.67.103]) with ESMTPSA (Nemesis)
id 1MpDRv-1kW93Y0lGZ-00qjvh for <[email protected]>; Thu, 06 Aug 2020
18:40:31 +0200
Date: Thu, 6 Aug 2020 18:40:30 +0200 (CEST)
From: Camping <[email protected]>
Reply-To: Camping <[email protected]>
To: Alice <[email protected]>
Message-ID: <[email protected]>
Subject: Re: subj?
MIME-Version: 1.0
Content-Type: multipart/related;
boundary="----=_Part_1287438_2124736777.1596732031007"
X-Mailer: Open-Xchange Mailer v7.10.1-Rev32
X-Originating-Client: open-xchange-appsuite

------=_Part_1287438_2124736777.1596732031007
MIME-Version: 1.0
Content-Type: text/html; charset=UTF-8
Content-Transfer-Encoding: quoted-printable

<!doctype html>
<html>
<head>=20
<meta charset=3D"UTF-8">=20
</head>
<body>
<div class=3D"default-style">
Guten Abend,
<br>
</div>
<div class=3D"default-style">
<br>
</div>
<div class=3D"default-style">
Lots of text
<br>
</div>
<div class=3D"default-style">
text with Umlaut =C3=A4...
<br>
</div>
<div class=3D"default-style">
MfG
<br>
</div>
<div class=3D"default-style">
<br>
</div>
<div class=3D"io-ox-signature">
<p>--------------------------------------<br></p>
<p style=3D"text-align: left;"><a href=3D"https://example.com=
/">Camping </a><br></p>
<p>someaddress<br></p>
<p>sometown</p>
<p><img alt=3D"" class=3D"aspect-ratio" style=3D"width: 505px; height: 1=
68px; max-width: 100%;" id=3D"d5cd260f-1b1f-4bfa-81d7-c0a74923b979" src=3D"=
cid:d5cd260f-1b1f-4bfa-81d7-c0a74923b979" width=3D"505" height=3D"168"><br>=
</p>
</div>
<blockquote type=3D"cite">
<div>
Le 5 ao=C3=BBt 2020 =C3=A0 10:46, holger &lt;
<a href=3D"mailto:[email protected]">[email protected]</a>&gt; a =
=C3=A9crit&nbsp;:
</div>
<div>
<br>
</div>
<div>
<br>
</div>
<div>
Bonjour,
</div>
<div>
<br>
</div>
<div>
<br>
</div>
<div>
<br>
</div>
<div>
--
</div>
<div>
Sent with my Delta Chat Messenger:=20
<a href=3D"https://delta.chat" rel=3D"noopener" target=3D"_blank">https=
://delta.chat</a>
<br>
</div>
</blockquote>=20
</body>
</html>
------=_Part_1287438_2124736777.1596732031007
Content-Type: image/png
Content-Transfer-Encoding: base64
Content-ID: <d5cd260f-1b1f-4bfa-81d7-c0a74923b979>
Content-Disposition: inline
iVBORw0KGgoAAAANSUhEUgAAAlgAAADICAYAAAA0n5+2AAAgAElEQVR4nCzaZ3sjhIGo7fyMPSch
MN29S7YsW71avfcuWZJlS3K33HuvY08fpjEVhoGhDh0CSSAESEhCgDRSKNns5mx2k8AM5Dznw/v+
[scrubbed]
/kNZ/B9Uub32fzngSwAAAABJRU5ErkJggg==
------=_Part_1287438_2124736777.1596732031007--

0 comments on commit 1a736ca

Please sign in to comment.