Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Fix URL preview errors when previewing XML documents #11196

Merged
merged 3 commits into from
Oct 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/11196.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix a bug introduced in v1.46.0rc1 where URL previews of some XML documents would fail.
9 changes: 6 additions & 3 deletions synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -718,9 +718,12 @@ def decode_body(
if not body:
return None

# The idea here is that multiple encodings are tried until one works.
# Unfortunately the decoded result is discarded, so LXML will decode the
# raw bytes a second time using the encoding that was found.
for encoding in get_html_media_encodings(body, content_type):
try:
body_str = body.decode(encoding)
body.decode(encoding)
except Exception:
pass
else:
Expand All @@ -732,11 +735,11 @@ def decode_body(
from lxml import etree

# Create an HTML parser.
parser = etree.HTMLParser(recover=True, encoding="utf-8")
parser = etree.HTMLParser(recover=True, encoding=encoding)

# Attempt to parse the body. Returns None if the body was successfully
# parsed, but no tree was found.
return etree.fromstring(body_str, parser)
return etree.fromstring(body, parser)


def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
Expand Down
15 changes: 15 additions & 0 deletions tests/test_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,21 @@ def test_no_tree(self):
tree = decode_body(html, "http://example.com/test.html")
self.assertIsNone(tree)

def test_xml(self):
    """Decode an XHTML body that opens with an XML declaration and check the computed og values."""
    url = "http://example.com/test.html"
    # strip() trims the whitespace surrounding the literal so that the
    # <?xml ...?> declaration sits at the very first byte of the body.
    xhtml = b"""
<?xml version="1.0" encoding="UTF-8"?>

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head><title>Foo</title></head><body>Some text.</body></html>
""".strip()
    parsed = decode_body(xhtml, url)
    # The title and body text of the document should surface as og:title
    # and og:description respectively.
    self.assertEqual(
        _calc_og(parsed, url),
        {"og:title": "Foo", "og:description": "Some text."},
    )

def test_invalid_encoding(self):
"""An invalid character encoding should be ignored and treated as UTF-8, if possible."""
html = b"""
Expand Down