diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py index c71947ee..b886ca50 100644 --- a/bleach/html5lib_shim.py +++ b/bleach/html5lib_shim.py @@ -48,6 +48,7 @@ HTMLInputStream, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib.serializer import ( + escape, HTMLSerializer, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib._tokenizer import ( diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index bc66ad2a..0f5b7cc5 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -371,6 +371,10 @@ def sanitize_token(self, token): elif token_type == "Comment": if not self.strip_html_comments: + # call xml.sax.saxutils to escape &, <, and > in addition to " and ' + token["data"] = html5lib_shim.escape( + token["data"], entities={'"': "&quot;", "'": "&#x27;"} + ) return token else: return None diff --git a/tests/test_clean.py b/tests/test_clean.py index 1cd58df0..7c565750 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -739,6 +739,53 @@ def test_namespace_rc_data_element_strip_false( ) +@pytest.mark.parametrize( + "namespace_tag, end_tag, data, expected", + [ + ( + "math", + "p", + "
", + "", + ), + ( + "math", + "br", + "