diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py index c71947ee..b886ca50 100644 --- a/bleach/html5lib_shim.py +++ b/bleach/html5lib_shim.py @@ -48,6 +48,7 @@ HTMLInputStream, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib.serializer import ( + escape, HTMLSerializer, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib._tokenizer import ( diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index bc66ad2a..0f5b7cc5 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -371,6 +371,10 @@ def sanitize_token(self, token): elif token_type == "Comment": if not self.strip_html_comments: + # call xml.sax.saxutils to escape &, <, and > in addition to " and ' + token["data"] = html5lib_shim.escape( + token["data"], entities={'"': "&quot;", "'": "&#x27;"} + ) return token else: return None diff --git a/tests/test_clean.py b/tests/test_clean.py index 1cd58df0..7c565750 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -739,6 +739,53 @@ def test_namespace_rc_data_element_strip_false( ) +@pytest.mark.parametrize( + "namespace_tag, end_tag, data, expected", + [ + ( + "math", + "p", + "

", + ), + ( + "math", + "br", + "
", + ), + ( + "svg", + "p", + "

", + ), + ( + "svg", + "br", + "
", + ), + ], +) +def test_html_comments_escaped(namespace_tag, end_tag, data, expected): + # refs: bug 1689399 / GHSA-vv2x-vrpj-qqpq + # + # p and br can be just an end tag (e.g. </p> == <p></p>) + # + # In browsers: + # + # * img and other tags break out of the svg or math namespace (e.g. == ) + # * style does not (e.g. == ) + # * the breaking tag ejects trailing elements (e.g. == ) + # + # the ejected elements can trigger XSS + assert ( + clean(data, tags=[namespace_tag, end_tag, "style"], strip_comments=False) + == expected + ) + + def get_ids_and_tests(): """Retrieves regression tests from data/ directory