Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix #1804: remove <!doctype html> and accept invalid HTML #1851

Merged
merged 2 commits into from
Aug 19, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions src/dc_receive_imf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2623,4 +2623,26 @@ mod tests {
);
assert_eq!(last_msg.from_id, DC_CONTACT_ID_INFO);
}

/// Feeds the `wrong-html.eml` fixture through `dc_receive_imf()` with
/// `Config::ShowEmails` set to `"2"` and checks the text of the message that
/// the chatlist reports at index 0.
#[async_std::test]
async fn test_html_only_mail() {
    let alice = TestContext::new_alice().await;
    let ctx = &alice.ctx;
    ctx.set_config(Config::ShowEmails, Some("2")).await.unwrap();

    // The fixture is embedded at compile time; the path is relative to this source file.
    let raw_mail: &[u8] = include_bytes!("../test-data/message/wrong-html.eml");
    dc_receive_imf(ctx, raw_mail, "INBOX", 0, false)
        .await
        .unwrap();

    let chatlist = Chatlist::try_load(ctx, 0, None, None).await.unwrap();
    let first_msg_id = chatlist.get_msg_id(0).unwrap();
    let received = Message::load_from_db(ctx, first_msg_id).await.unwrap();

    let expected = " Guten Abend, \n\n Lots of text \n\n text with Umlaut ä... \n\n MfG [...]";
    assert_eq!(received.text.unwrap(), expected);
}
}
44 changes: 39 additions & 5 deletions src/dehtml.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,19 @@ enum AddText {
// dehtml() returns way too many newlines; however, an optimisation on this issue is not needed as
// the newlines are typically removed in further processing by the caller
pub fn dehtml(buf: &str) -> String {
let buf = buf.trim();
let s = dehtml_quick_xml(buf);
if !s.trim().is_empty() {
return s;
}
let s = dehtml_manually(buf);
if !s.trim().is_empty() {
return s;
}
buf.to_string()
}

pub fn dehtml_quick_xml(buf: &str) -> String {
let buf = buf.trim().trim_start_matches("<!doctype html>");

let mut dehtml = Dehtml {
strbuilder: String::with_capacity(buf.len()),
Expand Down Expand Up @@ -171,9 +183,28 @@ fn dehtml_starttag_cb<B: std::io::BufRead>(
}
}

/// Crude tag stripper: drops everything between `<` and `>`.
///
/// The input is scanned character by character. A `<` switches output off and
/// a `>` switches it back on; neither bracket is ever copied to the result.
/// Consequently a `>` that appears outside of a tag is dropped as well, and an
/// unterminated `<` suppresses the rest of the input. Entities such as `&amp;`
/// are not decoded.
pub fn dehtml_manually(buf: &str) -> String {
    let mut inside_tag = false;
    buf.chars()
        .filter(|&ch| match ch {
            '<' => {
                inside_tag = true;
                false
            }
            '>' => {
                inside_tag = false;
                false
            }
            _ => !inside_tag,
        })
        .collect()
}

#[cfg(test)]
mod tests {
use super::*;
use crate::simplify::simplify;

#[test]
fn test_dehtml() {
Expand All @@ -182,20 +213,23 @@ mod tests {
"<a href='https://example.com'> Foo </a>",
"[ Foo ](https://example.com)",
),
("<img href='/foo.png'>", ""),
("<b> bar </b>", "* bar *"),
("<b> bar <i> foo", "* bar _ foo"),
("&amp; bar", "& bar"),
// Note missing '
("<a href='/foo.png>Hi</a> ", ""),
// Despite missing ', this should be shown:
("<a href='/foo.png>Hi</a> ", "Hi "),
(
"<a href='https://get.delta.chat/'/>",
"[](https://get.delta.chat/)",
),
("", ""),
("<!doctype html>\n<b>fat text</b>", "*fat text*"),
// Invalid html (at least DC should show the text if the html is invalid):
("<!some invalid html code>\n<b>some text</b>", "some text"),
("<This text is in brackets>", "<This text is in brackets>"),
];
for (input, output) in cases {
assert_eq!(dehtml(input), output);
assert_eq!(simplify(dehtml(input), true).0, output);
}
}

Expand Down
114 changes: 114 additions & 0 deletions test-data/message/wrong-html.eml
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
Return-Path: <[email protected]>
X-Original-To: [email protected]
Delivered-To: [email protected]
Received: from mout.kundenserver.de (mout.kundenserver.de [212.227.126.131])
by dd37930.kasserver.com (Postfix) with ESMTPS id 271F34B4258C
for <[email protected]>; Thu, 6 Aug 2020 18:40:32 +0200 (CEST)
Received: from oxbsltgw18.schlund.de ([172.19.249.35]) by
mrelayeu.kundenserver.de (mreue009 [213.165.67.103]) with ESMTPSA (Nemesis)
id 1MpDRv-1kW93Y0lGZ-00qjvh for <[email protected]>; Thu, 06 Aug 2020
18:40:31 +0200
Date: Thu, 6 Aug 2020 18:40:30 +0200 (CEST)
From: Camping <[email protected]>
Reply-To: Camping <[email protected]>
To: Alice <[email protected]>
Message-ID: <[email protected]>
Subject: Re: subj?
MIME-Version: 1.0
Content-Type: multipart/related;
boundary="----=_Part_1287438_2124736777.1596732031007"
X-Mailer: Open-Xchange Mailer v7.10.1-Rev32
X-Originating-Client: open-xchange-appsuite

------=_Part_1287438_2124736777.1596732031007
MIME-Version: 1.0
Content-Type: text/html; charset=UTF-8
Content-Transfer-Encoding: quoted-printable

<!doctype html>
<html>
<head>=20
<meta charset=3D"UTF-8">=20
</head>
<body>
<div class=3D"default-style">
Guten Abend,
<br>
</div>
<div class=3D"default-style">
<br>
</div>
<div class=3D"default-style">
Lots of text
<br>
</div>
<div class=3D"default-style">
text with Umlaut =C3=A4...
<br>
</div>
<div class=3D"default-style">
MfG
<br>
</div>
<div class=3D"default-style">
<br>
</div>
<div class=3D"io-ox-signature">
<p>--------------------------------------<br></p>
<p style=3D"text-align: left;"><a href=3D"https://example.com=
/">Camping </a><br></p>
<p>someaddress<br></p>
<p>sometown</p>
<p><img alt=3D"" class=3D"aspect-ratio" style=3D"width: 505px; height: 1=
68px; max-width: 100%;" id=3D"d5cd260f-1b1f-4bfa-81d7-c0a74923b979" src=3D"=
cid:d5cd260f-1b1f-4bfa-81d7-c0a74923b979" width=3D"505" height=3D"168"><br>=
</p>
</div>
<blockquote type=3D"cite">
<div>
Le 5 ao=C3=BBt 2020 =C3=A0 10:46, holger &lt;
<a href=3D"mailto:[email protected]">[email protected]</a>&gt; a =
=C3=A9crit&nbsp;:
</div>
<div>
<br>
</div>
<div>
<br>
</div>
<div>
Bonjour,
</div>
<div>
<br>
</div>
<div>
<br>
</div>
<div>
<br>
</div>
<div>
--
</div>
<div>
Sent with my Delta Chat Messenger:=20
<a href=3D"https://delta.chat" rel=3D"noopener" target=3D"_blank">https=
://delta.chat</a>
<br>
</div>
</blockquote>=20
</body>
</html>
------=_Part_1287438_2124736777.1596732031007
Content-Type: image/png
Content-Transfer-Encoding: base64
Content-ID: <d5cd260f-1b1f-4bfa-81d7-c0a74923b979>
Content-Disposition: inline

iVBORw0KGgoAAAANSUhEUgAAAlgAAADICAYAAAA0n5+2AAAgAElEQVR4nCzaZ3sjhIGo7fyMPSch
MN29S7YsW71avfcuWZJlS3K33HuvY08fpjEVhoGhDh0CSSAESEhCgDRSKNns5mx2k8AM5Dznw/v+
[scrubbed]
/kNZ/B9Uub32fzngSwAAAABJRU5ErkJggg==
------=_Part_1287438_2124736777.1596732031007--