From 4ba8d5ae516de842047498417614aebc917440d3 Mon Sep 17 00:00:00 2001 From: Hocuri Date: Wed, 19 Aug 2020 12:20:01 +0200 Subject: [PATCH 1/2] Fix #1804: remove <!doctype html> and accept invalid HTML This fixes #1804 in two ways: First, it removes a <!doctype html> from the start of the mail, if there is any. Then, it parses the html itself if quick-xml fails, just stripping everything between < and >. Both of these would have fixed this specific issue. Also, add tests for both fixes. --- src/dc_receive_imf.rs | 22 ++++++ src/dehtml.rs | 50 +++++++++++++- test-data/message/wrong-html.eml | 114 +++++++++++++++++++++++++++++++ 3 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 test-data/message/wrong-html.eml diff --git a/src/dc_receive_imf.rs b/src/dc_receive_imf.rs index 8055d5892f..5155629463 100644 --- a/src/dc_receive_imf.rs +++ b/src/dc_receive_imf.rs @@ -2623,4 +2623,26 @@ mod tests { ); assert_eq!(last_msg.from_id, DC_CONTACT_ID_INFO); } + + #[async_std::test] + async fn test_html_only_mail() { + let t = TestContext::new_alice().await; + t.ctx + .set_config(Config::ShowEmails, Some("2")) + .await + .unwrap(); + dc_receive_imf( + &t.ctx, + include_bytes!("../test-data/message/wrong-html.eml"), + "INBOX", + 0, + false, + ) + .await + .unwrap(); + let chats = Chatlist::try_load(&t.ctx, 0, None, None).await.unwrap(); + let msg_id = chats.get_msg_id(0).unwrap(); + let msg = Message::load_from_db(&t.ctx, msg_id).await.unwrap(); + assert_eq!(msg.text.unwrap(), " Guten Abend, \n\n Lots of text \n\n text with Umlaut ä... 
\n\n MfG [...]"); + } } diff --git a/src/dehtml.rs b/src/dehtml.rs index 9e2f6fb169..372baa4106 100644 --- a/src/dehtml.rs +++ b/src/dehtml.rs @@ -25,7 +25,19 @@ enum AddText { // dehtml() returns way too many newlines; however, an optimisation on this issue is not needed as // the newlines are typically removed in further processing by the caller pub fn dehtml(buf: &str) -> String { - let buf = buf.trim(); + let s = dehtml_quick_xml(buf); + if !s.trim().is_empty() { + return s; + } + let s = dehtml_manually(buf); + if !s.trim().is_empty() { + return s; + } + buf.to_string() +} + +pub fn dehtml_quick_xml(buf: &str) -> String { + let buf = buf.trim().trim_start_matches(""); let mut dehtml = Dehtml { strbuilder: String::with_capacity(buf.len()), @@ -171,6 +183,24 @@ fn dehtml_starttag_cb( } } +pub fn dehtml_manually(buf: &str) -> String { + // Just strip out everything between "<" and ">" + let mut strbuilder = String::new(); + let mut show_next_chars = true; + for c in buf.chars() { + match c { + '<' => show_next_chars = false, + '>' => show_next_chars = true, + _ => { + if show_next_chars { + strbuilder.push(c) + } + } + } + } + strbuilder +} + #[cfg(test)] mod tests { use super::*; @@ -254,4 +284,22 @@ mod tests { let txt = dehtml(input); assert_eq!(txt.trim(), "lots of text"); } + + #[test] + fn test_doctype_html() { + use crate::simplify::simplify; + + let input = "\nfat text"; + let txt = simplify(dehtml(input), false).0; + assert_eq!(txt.trim(), "*fat text*"); + + let input = "\nsome text"; + let txt = simplify(dehtml(input), false).0; + assert_eq!(txt.trim(), "some text"); + // at least DC should show the text if the html is invalid + + let input = ""; + let txt = simplify(dehtml(input), false).0; + assert_eq!(txt.trim(), ""); + } } diff --git a/test-data/message/wrong-html.eml b/test-data/message/wrong-html.eml new file mode 100644 index 0000000000..9540209c66 --- /dev/null +++ b/test-data/message/wrong-html.eml @@ -0,0 +1,114 @@ +Return-Path: 
+X-Original-To: alice@example.com +Delivered-To: m045a7e8@dd37930.kasserver.com +Received: from mout.kundenserver.de (mout.kundenserver.de [212.227.126.131]) + by dd37930.kasserver.com (Postfix) with ESMTPS id 271F34B4258C + for ; Thu, 6 Aug 2020 18:40:32 +0200 (CEST) +Received: from oxbsltgw18.schlund.de ([172.19.249.35]) by + mrelayeu.kundenserver.de (mreue009 [213.165.67.103]) with ESMTPSA (Nemesis) + id 1MpDRv-1kW93Y0lGZ-00qjvh for ; Thu, 06 Aug 2020 + 18:40:31 +0200 +Date: Thu, 6 Aug 2020 18:40:30 +0200 (CEST) +From: Camping +Reply-To: Camping +To: Alice +Message-ID: <512278196.1287440.1596732031020@email.ionos.fr> +Subject: Re: subj? +MIME-Version: 1.0 +Content-Type: multipart/related; + boundary="----=_Part_1287438_2124736777.1596732031007" +X-Mailer: Open-Xchange Mailer v7.10.1-Rev32 +X-Originating-Client: open-xchange-appsuite + +------=_Part_1287438_2124736777.1596732031007 +MIME-Version: 1.0 +Content-Type: text/html; charset=UTF-8 +Content-Transfer-Encoding: quoted-printable + + + + =20 + =20 + + +
+ Guten Abend, +
+
+
+
+
+
+ Lots of text +
+
+
+ text with Umlaut =C3=A4... +
+
+
+ MfG +
+
+
+
+
+
+

--------------------------------------

+

Camping

+

someaddress

+

sometown

+

3D""
= +

+
+
+
+ Le 5 ao=C3=BBt 2020 =C3=A0 10:46, holger < + holger@somedomain.de> a = +=C3=A9crit : +
+
+
+
+
+
+
+
+ Bonjour, +
+
+
+
+
+
+
+
+
+
+
+ -- +
+
+ Sent with my Delta Chat Messenger:=20 + https= +://delta.chat +
+
+
=20 + + +------=_Part_1287438_2124736777.1596732031007 +Content-Type: image/png +Content-Transfer-Encoding: base64 +Content-ID: +Content-Disposition: inline + +iVBORw0KGgoAAAANSUhEUgAAAlgAAADICAYAAAA0n5+2AAAgAElEQVR4nCzaZ3sjhIGo7fyMPSch +MN29S7YsW71avfcuWZJlS3K33HuvY08fpjEVhoGhDh0CSSAESEhCgDRSKNns5mx2k8AM5Dznw/v+ +[scrubbed] +/kNZ/B9Uub32fzngSwAAAABJRU5ErkJggg== +------=_Part_1287438_2124736777.1596732031007-- + From 477c46c3a2980f39091de1acf4276ee65a407167 Mon Sep 17 00:00:00 2001 From: Hocuri Date: Wed, 19 Aug 2020 17:00:40 +0200 Subject: [PATCH 2/2] Fix tests --- src/dehtml.rs | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/src/dehtml.rs b/src/dehtml.rs index 372baa4106..f12b4b9bef 100644 --- a/src/dehtml.rs +++ b/src/dehtml.rs @@ -204,6 +204,7 @@ pub fn dehtml_manually(buf: &str) -> String { #[cfg(test)] mod tests { use super::*; + use crate::simplify::simplify; #[test] fn test_dehtml() { @@ -212,20 +213,23 @@ mod tests { " Foo ", "[ Foo ](https://example.com)", ), - ("", ""), (" bar ", "* bar *"), (" bar foo", "* bar _ foo"), ("& bar", "& bar"), - // Note missing ' - ("", "[](https://get.delta.chat/)", ), ("", ""), + ("\nfat text", "*fat text*"), + // Invalid html (at least DC should show the text if the html is invalid): + ("\nsome text", "some text"), + ("", ""), ]; for (input, output) in cases { - assert_eq!(dehtml(input), output); + assert_eq!(simplify(dehtml(input), true).0, output); } } @@ -284,22 +288,4 @@ mod tests { let txt = dehtml(input); assert_eq!(txt.trim(), "lots of text"); } - - #[test] - fn test_doctype_html() { - use crate::simplify::simplify; - - let input = "\nfat text"; - let txt = simplify(dehtml(input), false).0; - assert_eq!(txt.trim(), "*fat text*"); - - let input = "\nsome text"; - let txt = simplify(dehtml(input), false).0; - assert_eq!(txt.trim(), "some text"); - // at least DC should show the text if the html is invalid - - let input = ""; - let txt = simplify(dehtml(input), 
false).0; - assert_eq!(txt.trim(), ""); - } }