From ba96f34dc14c0d8cd274f1c9d9e56f2187707710 Mon Sep 17 00:00:00 2001
From: Chris Sewell
Date: Fri, 2 Jun 2023 15:18:38 +0200
Subject: [PATCH] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Refactor=20backslash=20esc?=
 =?UTF-8?q?ape=20logic=20(#276)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 markdown_it/parser_core.py                  | 11 ++-
 markdown_it/parser_inline.py                |  9 +-
 markdown_it/presets/commonmark.py           |  4 +-
 markdown_it/presets/zero.py                 |  7 +-
 markdown_it/rules_core/__init__.py          |  2 +
 markdown_it/rules_core/text_join.py         | 34 +++++++
 markdown_it/rules_inline/__init__.py        |  4 +-
 markdown_it/rules_inline/escape.py          | 91 +++++++++++--------
 .../{text_collapse.py => fragments_join.py} |  2 +-
 tests/test_api/test_main.py                 | 35 +++----
 tests/test_port/fixtures/linkify.md         |  9 +-
 tests/test_port/fixtures/smartquotes.md     | 15 ++-
 tests/test_port/fixtures/typographer.md     | 17 ++++
 13 files changed, 173 insertions(+), 67 deletions(-)
 create mode 100644 markdown_it/rules_core/text_join.py
 rename markdown_it/rules_inline/{text_collapse.py => fragments_join.py} (96%)

diff --git a/markdown_it/parser_core.py b/markdown_it/parser_core.py
index 251b7634..b3eb8abe 100644
--- a/markdown_it/parser_core.py
+++ b/markdown_it/parser_core.py
@@ -7,7 +7,15 @@
 from __future__ import annotations
 
 from .ruler import RuleFunc, Ruler
-from .rules_core import block, inline, linkify, normalize, replace, smartquotes
+from .rules_core import (
+    block,
+    inline,
+    linkify,
+    normalize,
+    replace,
+    smartquotes,
+    text_join,
+)
 from .rules_core.state_core import StateCore
 
 _rules: list[tuple[str, RuleFunc]] = [
@@ -17,6 +25,7 @@
     ("linkify", linkify),
     ("replacements", replace),
     ("smartquotes", smartquotes),
+    ("text_join", text_join),
 ]
 
 
diff --git a/markdown_it/parser_inline.py b/markdown_it/parser_inline.py
index a8228524..88140d3d 100644
--- a/markdown_it/parser_inline.py
+++ b/markdown_it/parser_inline.py
@@ -28,11 +28,18 @@
     ("entity", rules_inline.entity),
 ]
 
+# Note `rule2` ruleset was created specifically for emphasis/strikethrough
+# post-processing and may be changed in the future.
+#
+# Don't use this for anything except pairs (plugins working with `balance_pairs`).
+#
 _rules2: list[tuple[str, RuleFunc]] = [
     ("balance_pairs", rules_inline.link_pairs),
     ("strikethrough", rules_inline.strikethrough.postProcess),
     ("emphasis", rules_inline.emphasis.postProcess),
-    ("text_collapse", rules_inline.text_collapse),
+    # rules for pairs separate '**' into its own text tokens, which may be left unused,
+    # rule below merges unused segments back with the rest of the text
+    ("fragments_join", rules_inline.fragments_join),
 ]
 
 
diff --git a/markdown_it/presets/commonmark.py b/markdown_it/presets/commonmark.py
index 60a39250..3990d434 100644
--- a/markdown_it/presets/commonmark.py
+++ b/markdown_it/presets/commonmark.py
@@ -40,7 +40,7 @@ def make() -> PresetType:
             "highlight": None,
         },
         "components": {
-            "core": {"rules": ["normalize", "block", "inline"]},
+            "core": {"rules": ["normalize", "block", "inline", "text_join"]},
             "block": {
                 "rules": [
                     "blockquote",
@@ -68,7 +68,7 @@ def make() -> PresetType:
                     "newline",
                     "text",
                 ],
-                "rules2": ["balance_pairs", "emphasis", "text_collapse"],
+                "rules2": ["balance_pairs", "emphasis", "fragments_join"],
             },
         },
     }
diff --git a/markdown_it/presets/zero.py b/markdown_it/presets/zero.py
index fcc5eb3a..2f69a58d 100644
--- a/markdown_it/presets/zero.py
+++ b/markdown_it/presets/zero.py
@@ -33,8 +33,11 @@ def make() -> PresetType:
             "highlight": None,
         },
         "components": {
-            "core": {"rules": ["normalize", "block", "inline"]},
+            "core": {"rules": ["normalize", "block", "inline", "text_join"]},
             "block": {"rules": ["paragraph"]},
-            "inline": {"rules": ["text"], "rules2": ["balance_pairs", "text_collapse"]},
+            "inline": {
+                "rules": ["text"],
+                "rules2": ["balance_pairs", "fragments_join"],
+            },
         },
     }
diff --git a/markdown_it/rules_core/__init__.py b/markdown_it/rules_core/__init__.py
index f80034c5..c9c5368c 100644
--- a/markdown_it/rules_core/__init__.py
+++ b/markdown_it/rules_core/__init__.py
@@ -6,6 +6,7 @@
     "replace",
     "smartquotes",
     "linkify",
+    "text_join",
 )
 
 from .block import block
@@ -15,3 +16,4 @@
 from .replacements import replace
 from .smartquotes import smartquotes
 from .state_core import StateCore
+from .text_join import text_join
diff --git a/markdown_it/rules_core/text_join.py b/markdown_it/rules_core/text_join.py
new file mode 100644
index 00000000..d54ccbbc
--- /dev/null
+++ b/markdown_it/rules_core/text_join.py
@@ -0,0 +1,34 @@
+"""Join raw text tokens with the rest of the text
+
+This is set as a separate rule to provide an opportunity for plugins
+to run text replacements after text join, but before escape join.
+
+For example, `\\:)` shouldn't be replaced with an emoji.
+"""
+from __future__ import annotations
+
+from ..token import Token
+from .state_core import StateCore
+
+
+def text_join(state: StateCore) -> None:
+    """Join raw text for escape sequences (`text_special`) tokens with the rest of the text"""
+
+    for inline_token in state.tokens[:]:
+        if inline_token.type != "inline":
+            continue
+
+        # convert text_special to text and join all adjacent text nodes
+        new_tokens: list[Token] = []
+        for child_token in inline_token.children or []:
+            if child_token.type == "text_special":
+                child_token.type = "text"
+            if (
+                child_token.type == "text"
+                and new_tokens
+                and new_tokens[-1].type == "text"
+            ):
+                new_tokens[-1].content += child_token.content
+            else:
+                new_tokens.append(child_token)
+        inline_token.children = new_tokens
diff --git a/markdown_it/rules_inline/__init__.py b/markdown_it/rules_inline/__init__.py
index f27907ce..dde97d34 100644
--- a/markdown_it/rules_inline/__init__.py
+++ b/markdown_it/rules_inline/__init__.py
@@ -1,7 +1,7 @@
 __all__ = (
     "StateInline",
     "text",
-    "text_collapse",
+    "fragments_join",
     "link_pairs",
     "escape",
     "newline",
@@ -20,10 +20,10 @@
 from .balance_pairs import link_pairs
 from .entity import entity
 from .escape import escape
+from .fragments_join import fragments_join
 from .html_inline import html_inline
 from .image import image
 from .link import link
 from .newline import newline
 from .state_inline import StateInline
 from .text import text
-from .text_collapse import text_collapse
diff --git a/markdown_it/rules_inline/escape.py b/markdown_it/rules_inline/escape.py
index 8694cec1..9f68b5df 100644
--- a/markdown_it/rules_inline/escape.py
+++ b/markdown_it/rules_inline/escape.py
@@ -4,6 +4,58 @@
 from ..common.utils import isStrSpace
 from .state_inline import StateInline
 
+
+def escape(state: StateInline, silent: bool) -> bool:
+    """Process escaped chars and hardbreaks."""
+    pos = state.pos
+    maximum = state.posMax
+
+    if state.src[pos] != "\\":
+        return False
+
+    pos += 1
+
+    # '\' at the end of the inline block
+    if pos >= maximum:
+        return False
+
+    ch1 = state.src[pos]
+    ch1_ord = ord(ch1)
+    if ch1 == "\n":
+        if not silent:
+            state.push("hardbreak", "br", 0)
+        pos += 1
+        # skip leading whitespaces from next line
+        while pos < maximum:
+            ch = state.src[pos]
+            if not isStrSpace(ch):
+                break
+            pos += 1
+
+        state.pos = pos
+        return True
+
+    escapedStr = state.src[pos]
+
+    if ch1_ord >= 0xD800 and ch1_ord <= 0xDBFF and pos + 1 < maximum:
+        ch2 = state.src[pos + 1]
+        ch2_ord = ord(ch2)
+        if ch2_ord >= 0xDC00 and ch2_ord <= 0xDFFF:
+            escapedStr += ch2
+            pos += 1
+
+    origStr = "\\" + escapedStr
+
+    if not silent:
+        token = state.push("text_special", "", 0)
+        token.content = escapedStr if ch1 in _ESCAPED else origStr
+        token.markup = origStr
+        token.info = "escape"
+
+    state.pos = pos + 1
+    return True
+
+
 _ESCAPED = {
     "!",
     '"',
@@ -38,42 +90,3 @@
     "}",
     "~",
 }
-
-
-def escape(state: StateInline, silent: bool) -> bool:
-    pos = state.pos
-    maximum = state.posMax
-
-    if state.src[pos] != "\\":
-        return False
-
-    pos += 1
-
-    if pos < maximum:
-        ch = state.src[pos]
-
-        if ch in _ESCAPED:
-            if not silent:
-                state.pending += state.src[pos]
-            state.pos += 2
-            return True
-
-        if ch == "\n":
-            if not silent:
-                state.push("hardbreak", "br", 0)
-
-            pos += 1
-            # skip leading whitespaces from next line
-            while pos < maximum:
-                ch = state.src[pos]
-                if not isStrSpace(ch):
-                    break
-                pos += 1
-
-            state.pos = pos
-            return True
-
-    if not silent:
-        state.pending += "\\"
-    state.pos += 1
-    return True
diff --git a/markdown_it/rules_inline/text_collapse.py b/markdown_it/rules_inline/fragments_join.py
similarity index 96%
rename from markdown_it/rules_inline/text_collapse.py
rename to markdown_it/rules_inline/fragments_join.py
index e09289cf..f795c136 100644
--- a/markdown_it/rules_inline/text_collapse.py
+++ b/markdown_it/rules_inline/fragments_join.py
@@ -1,7 +1,7 @@
 from .state_inline import StateInline
 
 
-def text_collapse(state: StateInline) -> None:
+def fragments_join(state: StateInline) -> None:
     """
     Clean up tokens after emphasis and strikethrough postprocessing:
     merge adjacent text nodes into one and re-calculate all token levels
diff --git a/tests/test_api/test_main.py b/tests/test_api/test_main.py
index c3a9ac8b..64a2bbe8 100644
--- a/tests/test_api/test_main.py
+++ b/tests/test_api/test_main.py
@@ -13,6 +13,7 @@ def test_get_rules():
             "linkify",
             "replacements",
             "smartquotes",
+            "text_join",
         ],
         "block": [
             "table",
@@ -40,7 +41,7 @@
             "html_inline",
             "entity",
         ],
-        "inline2": ["balance_pairs", "strikethrough", "emphasis", "text_collapse"],
+        "inline2": ["balance_pairs", "strikethrough", "emphasis", "fragments_join"],
     }
 
 
@@ -48,13 +49,13 @@ def test_load_presets():
     md = MarkdownIt("zero")
     assert md.get_active_rules() == {
         "block": ["paragraph"],
-        "core": ["normalize", "block", "inline"],
+        "core": ["normalize", "block", "inline", "text_join"],
         "inline": ["text"],
-        "inline2": ["balance_pairs", "text_collapse"],
+        "inline2": ["balance_pairs", "fragments_join"],
     }
     md = MarkdownIt("commonmark")
     assert md.get_active_rules() == {
-        "core": ["normalize", "block", "inline"],
+        "core": ["normalize", "block", "inline", "text_join"],
         "block": [
             "code",
             "fence",
@@ -79,7 +80,7 @@
             "html_inline",
             "entity",
         ],
-        "inline2": ["balance_pairs", "emphasis", "text_collapse"],
+        "inline2": ["balance_pairs", "emphasis", "fragments_join"],
     }
 
 
@@ -94,16 +95,16 @@ def test_enable():
     md = MarkdownIt("zero").enable("heading")
     assert md.get_active_rules() == {
         "block": ["heading", "paragraph"],
-        "core": ["normalize", "block", "inline"],
+        "core": ["normalize", "block", "inline", "text_join"],
         "inline": ["text"],
-        "inline2": ["balance_pairs", "text_collapse"],
+        "inline2": ["balance_pairs", "fragments_join"],
     }
     md.enable(["backticks", "autolink"])
     assert md.get_active_rules() == {
         "block": ["heading", "paragraph"],
-        "core": ["normalize", "block", "inline"],
+        "core": ["normalize", "block", "inline", "text_join"],
         "inline": ["text", "backticks", "autolink"],
-        "inline2": ["balance_pairs", "text_collapse"],
+        "inline2": ["balance_pairs", "fragments_join"],
     }
 
 
@@ -111,16 +112,16 @@ def test_disable():
     md = MarkdownIt("zero").disable("inline")
     assert md.get_active_rules() == {
         "block": ["paragraph"],
-        "core": ["normalize", "block"],
+        "core": ["normalize", "block", "text_join"],
         "inline": ["text"],
-        "inline2": ["balance_pairs", "text_collapse"],
+        "inline2": ["balance_pairs", "fragments_join"],
     }
     md.disable(["text"])
     assert md.get_active_rules() == {
         "block": ["paragraph"],
-        "core": ["normalize", "block"],
+        "core": ["normalize", "block", "text_join"],
         "inline": [],
-        "inline2": ["balance_pairs", "text_collapse"],
+        "inline2": ["balance_pairs", "fragments_join"],
     }
 
 
@@ -130,15 +131,15 @@ def test_reset():
         md.disable("inline")
         assert md.get_active_rules() == {
             "block": ["paragraph"],
-            "core": ["normalize", "block"],
+            "core": ["normalize", "block", "text_join"],
             "inline": ["text"],
-            "inline2": ["balance_pairs", "text_collapse"],
+            "inline2": ["balance_pairs", "fragments_join"],
         }
     assert md.get_active_rules() == {
         "block": ["paragraph"],
-        "core": ["normalize", "block", "inline"],
+        "core": ["normalize", "block", "inline", "text_join"],
         "inline": ["text"],
-        "inline2": ["balance_pairs", "text_collapse"],
+        "inline2": ["balance_pairs", "fragments_join"],
     }
 
 
diff --git a/tests/test_port/fixtures/linkify.md b/tests/test_port/fixtures/linkify.md
index 9edb78f3..c9755c03 100644
--- a/tests/test_port/fixtures/linkify.md
+++ b/tests/test_port/fixtures/linkify.md
@@ -96,4 +96,11 @@ after
 <p>before</p>
 <p><a href="http://github.com">github.com</a></p>
 <p>after</p>
-.
\ No newline at end of file
+.
+
+Don't match escaped
+.
+google\.com
+.
+<p>google.com</p>
+.
diff --git a/tests/test_port/fixtures/smartquotes.md b/tests/test_port/fixtures/smartquotes.md
index 70378b8e..e77175aa 100644
--- a/tests/test_port/fixtures/smartquotes.md
+++ b/tests/test_port/fixtures/smartquotes.md
@@ -163,4 +163,17 @@ Should parse quotes adjacent to inline html, #677:
 .
 <p>“test<br>”</p>
 <p>“<br>test”</p>
-.
\ No newline at end of file
+.
+
+Should be escapable:
+.
+"foo"
+
+\"foo"
+
+"foo\"
+.
+<p>“foo”</p>
+<p>&quot;foo&quot;</p>
+<p>&quot;foo&quot;</p>
+.
diff --git a/tests/test_port/fixtures/typographer.md b/tests/test_port/fixtures/typographer.md
index 39154ed0..59e48941 100644
--- a/tests/test_port/fixtures/typographer.md
+++ b/tests/test_port/fixtures/typographer.md
@@ -81,6 +81,13 @@ dupes-ellipsis
 
 <p>!.. ?.. ,… !!!.. ???.. ,…</p>
 .
+copyright should be escapable
+.
+\(c)
+.
+<p>(c)</p>
+.
+
 dashes
 .
 
@@ -101,6 +108,16 @@ markdownit--awesome
 
 <p>markdownit–awesome</p>
 .
+dashes should be escapable
+.
+foo \-- bar
+
+foo -\- bar
+.
+<p>foo -- bar</p>
+<p>foo -- bar</p>
+.
+
 regression tests for #624
 .
 1---2---3
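
Illustration of the behaviour this patch produces (a minimal sketch, not part of the
patch itself; it assumes a markdown-it-py version that already ships the new
`text_special` token type and the `text_join` core rule):

    from markdown_it import MarkdownIt

    md = MarkdownIt("commonmark")

    # The escaped "*" is emitted by the `escape` rule as a `text_special` token,
    # so emphasis never pairs it; the `text_join` core rule then folds it back
    # into the surrounding text once inline parsing is finished.
    tokens = md.parse(r"\*not emphasis*")
    inline_token = tokens[1]  # paragraph_open, inline, paragraph_close
    print([(t.type, t.content) for t in (inline_token.children or [])])
    # expected: [('text', '*not emphasis*')]

    print(md.render(r"\*not emphasis*"))
    # expected: <p>*not emphasis*</p>

A plugin that needs to see escaped characters individually can register a core rule
after "inline" and before "text_join", where the `text_special` tokens are still
present as separate children of each inline token.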