Skip to content

Commit

Permalink
Correctly handle string prefix in lxml tree creation (DMOJ#1463)
Browse files Browse the repository at this point in the history
  • Loading branch information
quantum5 authored Jun 19, 2020
1 parent 428cf35 commit 4719e37
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 8 deletions.
29 changes: 22 additions & 7 deletions judge/jinja2/markdown/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,26 @@ def get_cleaner(name, params):
return cleaner


def fragments_to_tree(fragment):
    """Parse an HTML fragment string into a tree rooted at a fresh ``<div>``.

    The fragment is parsed with a recovering HTML parser. Text that precedes
    the first element is stored as the wrapper's ``text``; everything else
    parsed is appended to the wrapper as children.

    If parsing raises ``XMLSyntaxError`` or ``ParserError``, the empty wrapper
    is returned. The failure is logged unless the fragment is falsy or the
    error is a ``ParserError`` whose first argument is 'Document is empty'.
    """
    root = html.Element('div')
    try:
        nodes = html.fragments_fromstring(fragment, parser=html.HTMLParser(recover=True))
    except (XMLSyntaxError, ParserError) as exc:
        if fragment:
            # An empty-document ParserError is expected noise, not a real failure.
            benign = isinstance(exc, ParserError) and exc.args[0] == 'Document is empty'
            if not benign:
                logger.exception('Failed to parse HTML string')
        return root

    # The first parsed item is a plain string when the fragment starts with
    # text; it cannot be appended as a child, so it becomes the root's text.
    has_text_prefix = bool(nodes) and isinstance(nodes[0], str)
    if has_text_prefix:
        root.text, nodes = nodes[0], nodes[1:]
    root.extend(nodes)
    return root


def fragment_tree_to_str(tree):
    """Serialize *tree* to a unicode string without its outer wrapper tags.

    The first ``len('<div>')`` and last ``len('</div>')`` characters of the
    serialized markup are sliced off, so the root is assumed to serialize as
    a bare ``<div>...</div>`` -- TODO confirm callers never add attributes or
    a tail to the root element.
    """
    opening, closing = '<div>', '</div>'
    serialized = html.tostring(tree, encoding='unicode')
    return serialized[len(opening):-len(closing)]


@registry.filter
def markdown(value, style, math_engine=None, lazy_load=False):
styles = settings.MARKDOWN_STYLES.get(style, settings.MARKDOWN_DEFAULT_STYLE)
Expand All @@ -152,15 +172,10 @@ def markdown(value, style, math_engine=None, lazy_load=False):
result = markdown(value)

if post_processors:
tree = html.Element('div')
try:
tree.extend(html.fragments_fromstring(result, parser=html.HTMLParser(recover=True)))
except (XMLSyntaxError, ParserError) as e:
if result and (not isinstance(e, ParserError) or e.args[0] != 'Document is empty'):
logger.exception('Failed to parse HTML string')
tree = fragments_to_tree(result)
for processor in post_processors:
processor(tree)
result = html.tostring(tree, encoding='unicode')[len('<div>'):-len('</div>')]
result = fragment_tree_to_str(tree)
if bleach_params:
result = get_cleaner(style, bleach_params).clean(result)
return Markup(result)
33 changes: 32 additions & 1 deletion judge/jinja2/markdown/test_markdown.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from django.test import SimpleTestCase
from lxml import html

from . import get_cleaner, markdown
from . import fragment_tree_to_str, fragments_to_tree, get_cleaner, markdown

MATHML_N = '''\
<math xmlns="http://www.w3.org/1998/Math/MathML">
Expand Down Expand Up @@ -123,3 +124,33 @@ def test_bleach_mathml(self):
def test_no_bleach(self):
    """Rendering a ``<script>`` tag with ``UNBLEACHED_STYLE`` yields HTML equal to the input."""
    rendered = markdown('<script>void(0)</script>', self.UNBLEACHED_STYLE)
    expected = '<script>void(0)</script>'
    self.assertHTMLEqual(rendered, expected)

def test_post_process(self):
    """Rendering an ``<img>`` with ``lazy_load=True`` yields a noscript copy plus an ``unveil`` placeholder."""
    expected = (
        '<p><noscript><img src="test.png"></noscript>'
        '<img src="/static/blank.gif" data-src="test.png" class="unveil"></p>'
    )
    rendered = markdown('<img src="test.png">', self.UNBLEACHED_STYLE, lazy_load=True)
    self.assertHTMLEqual(rendered, expected)


class TestFragmentUtils(SimpleTestCase):
    """Tests for ``fragments_to_tree`` and ``fragment_tree_to_str``."""

    def test_simple(self):
        """Two sibling paragraphs become two child elements and serialize back."""
        root = fragments_to_tree('<p>a</p><p>b</p>')
        self.assertIsInstance(root, html.HtmlElement)
        self.assertEqual(len(root.getchildren()), 2)

        # Check each paragraph in document order.
        for index, expected_text in enumerate(('a', 'b')):
            paragraph = root[index]
            self.assertIsInstance(paragraph, html.HtmlElement)
            self.assertEqual(paragraph.tag, 'p')
            self.assertEqual(paragraph.text, expected_text)

        serialized = fragment_tree_to_str(root)
        self.assertHTMLEqual(serialized, '<p>a</p><p>b</p>')

    def test_text_prefix(self):
        """Text before the first element is kept as the root's ``text``."""
        root = fragments_to_tree('z<p>a</p><p>b</p>')
        self.assertIsInstance(root, html.HtmlElement)
        self.assertEqual(len(root.getchildren()), 2)
        self.assertEqual(root.text, 'z')

        serialized = fragment_tree_to_str(root)
        self.assertHTMLEqual(serialized, 'z<p>a</p><p>b</p>')

0 comments on commit 4719e37

Please sign in to comment.