Skip to content

Commit

Permalink
🐛 FIX: numeric character reference parsing (#272)
Browse files Browse the repository at this point in the history
Fix an issue where a string was incorrectly identified as a numeric character reference, and the subsequent conversion to an integer code point failed.

From https://github.com/google/oss-fuzz/tree/master/projects/markdown-it-py, fixes issue 55371

This also essentially fixes a bug in upstream, see markdown-it/markdown-it#935
  • Loading branch information
chrisjsewell authored Jun 2, 2023
1 parent 36a428b commit 4e6dfd5
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 61 deletions.
74 changes: 20 additions & 54 deletions markdown_it/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
"""
from __future__ import annotations

import html
import re
from typing import Match, TypeVar

Expand Down Expand Up @@ -52,9 +51,6 @@ def arrayReplaceAt(
return src[:pos] + newElements + src[pos + 1 :]


######################################################################


def isValidEntityCode(c: int) -> bool:
# broken sequence
if c >= 0xD800 and c <= 0xDFFF:
Expand Down Expand Up @@ -89,47 +85,33 @@ def fromCodePoint(c: int) -> str:
return chr(c)


UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
# UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
# ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)
UNESCAPE_ALL_RE = re.compile(
r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});",
re.IGNORECASE,
)
DIGITAL_ENTITY_TEST_RE = re.compile(r"^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))", re.IGNORECASE)
DIGITAL_ENTITY_BASE10_RE = re.compile(r"#([0-9]{1,8})")
DIGITAL_ENTITY_BASE16_RE = re.compile(r"#x([a-f0-9]{1,8})", re.IGNORECASE)


def replaceEntityPattern(match: str, name: str) -> str:
"""Convert HTML entity patterns
::
https://www.google.com -> https%3A//www.google.com
"""Convert HTML entity patterns,
see https://spec.commonmark.org/0.30/#entity-references
"""
code = 0

if name in entities:
return entities[name]

if name[0] == "#" and DIGITAL_ENTITY_TEST_RE.search(name):
code = int(name[2:], 16) if name[1].lower() == "x" else int(name[1:], 10)
if isValidEntityCode(code):
return fromCodePoint(code)

return match


# def replaceEntities(string):
# if (string.indexOf('&') < 0):
# return string
# return string.replace(ENTITY_RE, replaceEntityPattern)
code: None | int = None
if pat := DIGITAL_ENTITY_BASE10_RE.fullmatch(name):
code = int(pat.group(1), 10)
elif pat := DIGITAL_ENTITY_BASE16_RE.fullmatch(name):
code = int(pat.group(1), 16)

if code is not None and isValidEntityCode(code):
return fromCodePoint(code)

def unescapeMd(string: str) -> str:
raise NotImplementedError
# if "\\" in string:
# return string
# return string.replace(UNESCAPE_MD_RE, "$1")
return match


def unescapeAll(string: str) -> str:
Expand All @@ -154,30 +136,14 @@ def stripEscape(string: str) -> str:
return ESCAPE_CHAR.sub(r"\1", string)


# //////////////////////////////////////////////////////////////////////////////

# TODO This section changed quite a lot, should re-check

# UNESCAPE_HTML_RE = re.compile(r"\\&(?=(amp\;|lt\;|gt\;|quot\;))")
# ESCAPE_AND_HTML = re.compile(r"&(?!(amp\;|lt\;|gt\;|quot\;))")
# HTML_ESCAPE_REPLACE_RE = re.compile(r'[&<>"]')


# def escapeHtml(string: str):

# if HTML_ESCAPE_REPLACE_RE.search(string):

# string = UNESCAPE_HTML_RE.sub("&", string)
# string = ESCAPE_AND_HTML.sub("&amp;", string)
# for k, v in {"<": "&lt;", ">": "&gt;", '"': "&quot;"}.items():
# string = string.replace(k, v)

# return string


def escapeHtml(raw: str) -> str:
# return html.escape(html.unescape(raw)).replace("&#x27;", "'")
return html.escape(raw).replace("&#x27;", "'")
"""Replace special characters "&", "<", ">" and '"' to HTML-safe sequences."""
# like html.escape, but without escaping single quotes
raw = raw.replace("&", "&amp;") # Must be done first!
raw = raw.replace("<", "&lt;")
raw = raw.replace(">", "&gt;")
raw = raw.replace('"', "&quot;")
return raw


# //////////////////////////////////////////////////////////////////////////////
Expand Down
14 changes: 7 additions & 7 deletions tests/test_fuzzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@
from markdown_it import MarkdownIt

TESTS = {
55363: ">```\n>",
55367: ">-\n>\n>",
# 55371: "[](so&#4»0;!" TODO this did not fail
# 55401: "?c_" * 100_000 TODO this did not fail
55363: (">```\n>", "<blockquote>\n<pre><code></code></pre>\n</blockquote>\n"),
55367: (">-\n>\n>", "<blockquote>\n<ul>\n<li></li>\n</ul>\n</blockquote>\n"),
55371: ("[](so&#4H0;!", "<p>[](so&amp;#4H0;!</p>\n"),
# 55401: (("?c_" * 100000) + "c_", ""), TODO this does not fail, just takes a long time
}


@pytest.mark.parametrize("raw_input", TESTS.values(), ids=TESTS.keys())
def test_fuzzing(raw_input):
@pytest.mark.parametrize("raw_input,expected", TESTS.values(), ids=TESTS.keys())
def test_fuzzing(raw_input, expected):
md = MarkdownIt()
md.parse(raw_input)
print(md.render(raw_input))
assert md.render(raw_input) == expected
9 changes: 9 additions & 0 deletions tests/test_port/fixtures/issue-fixes.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,12 @@ Fix CVE-2023-26303
<p><img src="%5B" alt="
" /></p>
.

Fix parsing of incorrect numeric character references
.
[](&#X22y;) &#X22y;
[](&#35y;) &#35y;
.
<p><a href="&amp;#X22y;"></a> &amp;#X22y;
<a href="&amp;#35y;"></a> &amp;#35y;</p>
.

0 comments on commit 4e6dfd5

Please sign in to comment.