From ba96f34dc14c0d8cd274f1c9d9e56f2187707710 Mon Sep 17 00:00:00 2001
From: Chris Sewell
Date: Fri, 2 Jun 2023 15:18:38 +0200
Subject: [PATCH] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Refactor=20backslash=20esc?=
 =?UTF-8?q?ape=20logic=20(#276)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 markdown_it/parser_core.py                  | 11 ++-
 markdown_it/parser_inline.py                |  9 +-
 markdown_it/presets/commonmark.py           |  4 +-
 markdown_it/presets/zero.py                 |  7 +-
 markdown_it/rules_core/__init__.py          |  2 +
 markdown_it/rules_core/text_join.py         | 34 +++++++
 markdown_it/rules_inline/__init__.py        |  4 +-
 markdown_it/rules_inline/escape.py          | 91 +++++++++++--------
 .../{text_collapse.py => fragments_join.py} |  2 +-
 tests/test_api/test_main.py                 | 35 +++----
 tests/test_port/fixtures/linkify.md         |  9 +-
 tests/test_port/fixtures/smartquotes.md     | 15 ++-
 tests/test_port/fixtures/typographer.md     | 17 ++++
 13 files changed, 173 insertions(+), 67 deletions(-)
 create mode 100644 markdown_it/rules_core/text_join.py
 rename markdown_it/rules_inline/{text_collapse.py => fragments_join.py} (96%)

diff --git a/markdown_it/parser_core.py b/markdown_it/parser_core.py
index 251b7634..b3eb8abe 100644
--- a/markdown_it/parser_core.py
+++ b/markdown_it/parser_core.py
@@ -7,7 +7,15 @@
 from __future__ import annotations
 
 from .ruler import RuleFunc, Ruler
-from .rules_core import block, inline, linkify, normalize, replace, smartquotes
+from .rules_core import (
+    block,
+    inline,
+    linkify,
+    normalize,
+    replace,
+    smartquotes,
+    text_join,
+)
 from .rules_core.state_core import StateCore
 
 _rules: list[tuple[str, RuleFunc]] = [
@@ -17,6 +25,7 @@
     ("linkify", linkify),
     ("replacements", replace),
     ("smartquotes", smartquotes),
+    ("text_join", text_join),
 ]
 
 
diff --git a/markdown_it/parser_inline.py b/markdown_it/parser_inline.py
index a8228524..88140d3d 100644
--- a/markdown_it/parser_inline.py
+++ b/markdown_it/parser_inline.py
@@ -28,11 +28,18 @@
     ("entity", rules_inline.entity),
 ]
 
+# Note `rule2` ruleset was created specifically for emphasis/strikethrough
+# post-processing and may be changed in the future.
+#
+# Don't use this for anything except pairs (plugins working with `balance_pairs`).
+#
 _rules2: list[tuple[str, RuleFunc]] = [
     ("balance_pairs", rules_inline.link_pairs),
     ("strikethrough", rules_inline.strikethrough.postProcess),
     ("emphasis", rules_inline.emphasis.postProcess),
-    ("text_collapse", rules_inline.text_collapse),
+    # rules for pairs separate '**' into its own text tokens, which may be left unused,
+    # rule below merges unused segments back with the rest of the text
+    ("fragments_join", rules_inline.fragments_join),
 ]
 
 
diff --git a/markdown_it/presets/commonmark.py b/markdown_it/presets/commonmark.py
index 60a39250..3990d434 100644
--- a/markdown_it/presets/commonmark.py
+++ b/markdown_it/presets/commonmark.py
@@ -40,7 +40,7 @@ def make() -> PresetType:
             "highlight": None,
         },
         "components": {
-            "core": {"rules": ["normalize", "block", "inline"]},
+            "core": {"rules": ["normalize", "block", "inline", "text_join"]},
             "block": {
                 "rules": [
                     "blockquote",
@@ -68,7 +68,7 @@ def make() -> PresetType:
                     "newline",
                     "text",
                 ],
-                "rules2": ["balance_pairs", "emphasis", "text_collapse"],
+                "rules2": ["balance_pairs", "emphasis", "fragments_join"],
             },
         },
     }
diff --git a/markdown_it/presets/zero.py b/markdown_it/presets/zero.py
index fcc5eb3a..2f69a58d 100644
--- a/markdown_it/presets/zero.py
+++ b/markdown_it/presets/zero.py
@@ -33,8 +33,11 @@ def make() -> PresetType:
             "highlight": None,
         },
         "components": {
-            "core": {"rules": ["normalize", "block", "inline"]},
+            "core": {"rules": ["normalize", "block", "inline", "text_join"]},
             "block": {"rules": ["paragraph"]},
-            "inline": {"rules": ["text"], "rules2": ["balance_pairs", "text_collapse"]},
+            "inline": {
+                "rules": ["text"],
+                "rules2": ["balance_pairs", "fragments_join"],
+            },
         },
     }
diff --git a/markdown_it/rules_core/__init__.py b/markdown_it/rules_core/__init__.py
index f80034c5..c9c5368c 100644
--- a/markdown_it/rules_core/__init__.py
+++ b/markdown_it/rules_core/__init__.py
@@ -6,6 +6,7 @@
     "replace",
     "smartquotes",
     "linkify",
+    "text_join",
 )
 
 from .block import block
@@ -15,3 +16,4 @@
 from .replacements import replace
 from .smartquotes import smartquotes
 from .state_core import StateCore
+from .text_join import text_join
diff --git a/markdown_it/rules_core/text_join.py b/markdown_it/rules_core/text_join.py
new file mode 100644
index 00000000..d54ccbbc
--- /dev/null
+++ b/markdown_it/rules_core/text_join.py
@@ -0,0 +1,34 @@
+"""Join raw text tokens with the rest of the text
+
+This is set as a separate rule to provide an opportunity for plugins
+to run text replacements after text join, but before escape join.
+
+For example, `\\:)` shouldn't be replaced with an emoji.
+"""
+from __future__ import annotations
+
+from ..token import Token
+from .state_core import StateCore
+
+
+def text_join(state: StateCore) -> None:
+    """Join raw text for escape sequences (`text_special`) tokens with the rest of the text"""
+
+    for inline_token in state.tokens[:]:
+        if inline_token.type != "inline":
+            continue
+
+        # convert text_special to text and join all adjacent text nodes
+        new_tokens: list[Token] = []
+        for child_token in inline_token.children or []:
+            if child_token.type == "text_special":
+                child_token.type = "text"
+            if (
+                child_token.type == "text"
+                and new_tokens
+                and new_tokens[-1].type == "text"
+            ):
+                new_tokens[-1].content += child_token.content
+            else:
+                new_tokens.append(child_token)
+        inline_token.children = new_tokens
diff --git a/markdown_it/rules_inline/__init__.py b/markdown_it/rules_inline/__init__.py
index f27907ce..dde97d34 100644
--- a/markdown_it/rules_inline/__init__.py
+++ b/markdown_it/rules_inline/__init__.py
@@ -1,7 +1,7 @@
 __all__ = (
     "StateInline",
     "text",
-    "text_collapse",
+    "fragments_join",
     "link_pairs",
     "escape",
     "newline",
@@ -20,10 +20,10 @@
 from .balance_pairs import link_pairs
 from .entity import entity
 from .escape import escape
+from .fragments_join import fragments_join
 from .html_inline import html_inline
 from .image import image
 from .link import link
 from .newline import newline
 from .state_inline import StateInline
 from .text import text
-from .text_collapse import text_collapse
diff --git a/markdown_it/rules_inline/escape.py b/markdown_it/rules_inline/escape.py
index 8694cec1..9f68b5df 100644
--- a/markdown_it/rules_inline/escape.py
+++ b/markdown_it/rules_inline/escape.py
@@ -4,6 +4,58 @@
 from ..common.utils import isStrSpace
 from .state_inline import StateInline
 
+
+def escape(state: StateInline, silent: bool) -> bool:
+    """Process escaped chars and hardbreaks."""
+    pos = state.pos
+    maximum = state.posMax
+
+    if state.src[pos] != "\\":
+        return False
+
+    pos += 1
+
+    # '\' at the end of the inline block
+    if pos >= maximum:
+        return False
+
+    ch1 = state.src[pos]
+    ch1_ord = ord(ch1)
+    if ch1 == "\n":
+        if not silent:
+            state.push("hardbreak", "br", 0)
+        pos += 1
+        # skip leading whitespaces from next line
+        while pos < maximum:
+            ch = state.src[pos]
+            if not isStrSpace(ch):
+                break
+            pos += 1
+
+        state.pos = pos
+        return True
+
+    escapedStr = state.src[pos]
+
+    if ch1_ord >= 0xD800 and ch1_ord <= 0xDBFF and pos + 1 < maximum:
+        ch2 = state.src[pos + 1]
+        ch2_ord = ord(ch2)
+        if ch2_ord >= 0xDC00 and ch2_ord <= 0xDFFF:
+            escapedStr += ch2
+            pos += 1
+
+    origStr = "\\" + escapedStr
+
+    if not silent:
+        token = state.push("text_special", "", 0)
+        token.content = escapedStr if ch1 in _ESCAPED else origStr
+        token.markup = origStr
+        token.info = "escape"
+
+    state.pos = pos + 1
+    return True
+
+
 _ESCAPED = {
     "!",
     '"',
@@ -38,42 +90,3 @@
     "}",
     "~",
 }
-
-
-def escape(state: StateInline, silent: bool) -> bool:
-    pos = state.pos
-    maximum = state.posMax
-
-    if state.src[pos] != "\\":
-        return False
-
-    pos += 1
-
-    if pos < maximum:
-        ch = state.src[pos]
-
-        if ch in _ESCAPED:
-            if not silent:
-                state.pending += state.src[pos]
-            state.pos += 2
-            return True
-
-        if ch == "\n":
-            if not silent:
-                state.push("hardbreak", "br", 0)
-
-            pos += 1
-            # skip leading whitespaces from next line
-            while pos < maximum:
-                ch = state.src[pos]
-                if not isStrSpace(ch):
-                    break
-                pos += 1
-
-            state.pos = pos
-            return True
-
-    if not silent:
-        state.pending += "\\"
-    state.pos += 1
-    return True
diff --git a/markdown_it/rules_inline/text_collapse.py b/markdown_it/rules_inline/fragments_join.py
similarity index 96%
rename from markdown_it/rules_inline/text_collapse.py
rename to markdown_it/rules_inline/fragments_join.py
index e09289cf..f795c136 100644
--- a/markdown_it/rules_inline/text_collapse.py
+++ b/markdown_it/rules_inline/fragments_join.py
@@ -1,7 +1,7 @@
 from .state_inline import StateInline
 
 
-def text_collapse(state: StateInline) -> None:
+def fragments_join(state: StateInline) -> None:
     """
     Clean up tokens after emphasis and strikethrough postprocessing:
     merge adjacent text nodes into one and re-calculate all token levels
diff --git a/tests/test_api/test_main.py b/tests/test_api/test_main.py
index c3a9ac8b..64a2bbe8 100644
--- a/tests/test_api/test_main.py
+++ b/tests/test_api/test_main.py
@@ -13,6 +13,7 @@ def test_get_rules():
             "linkify",
             "replacements",
             "smartquotes",
+            "text_join",
         ],
         "block": [
             "table",
@@ -40,7 +41,7 @@
             "html_inline",
             "entity",
         ],
-        "inline2": ["balance_pairs", "strikethrough", "emphasis", "text_collapse"],
+        "inline2": ["balance_pairs", "strikethrough", "emphasis", "fragments_join"],
     }
 
 
@@ -48,13 +49,13 @@ def test_load_presets():
     md = MarkdownIt("zero")
     assert md.get_active_rules() == {
         "block": ["paragraph"],
-        "core": ["normalize", "block", "inline"],
+        "core": ["normalize", "block", "inline", "text_join"],
         "inline": ["text"],
-        "inline2": ["balance_pairs", "text_collapse"],
+        "inline2": ["balance_pairs", "fragments_join"],
     }
     md = MarkdownIt("commonmark")
     assert md.get_active_rules() == {
-        "core": ["normalize", "block", "inline"],
+        "core": ["normalize", "block", "inline", "text_join"],
         "block": [
             "code",
             "fence",
@@ -79,7 +80,7 @@
             "html_inline",
             "entity",
         ],
-        "inline2": ["balance_pairs", "emphasis", "text_collapse"],
+        "inline2": ["balance_pairs", "emphasis", "fragments_join"],
     }
 
 
@@ -94,16 +95,16 @@ def test_enable():
     md = MarkdownIt("zero").enable("heading")
     assert md.get_active_rules() == {
         "block": ["heading", "paragraph"],
-        "core": ["normalize", "block", "inline"],
+        "core": ["normalize", "block", "inline", "text_join"],
         "inline": ["text"],
-        "inline2": ["balance_pairs", "text_collapse"],
+        "inline2": ["balance_pairs", "fragments_join"],
     }
     md.enable(["backticks", "autolink"])
     assert md.get_active_rules() == {
         "block": ["heading", "paragraph"],
-        "core": ["normalize", "block", "inline"],
+        "core": ["normalize", "block", "inline", "text_join"],
         "inline": ["text", "backticks", "autolink"],
-        "inline2": ["balance_pairs", "text_collapse"],
+        "inline2": ["balance_pairs", "fragments_join"],
     }
 
 
@@ -111,16 +112,16 @@ def test_disable():
     md = MarkdownIt("zero").disable("inline")
     assert md.get_active_rules() == {
         "block": ["paragraph"],
-        "core": ["normalize", "block"],
+        "core": ["normalize", "block", "text_join"],
         "inline": ["text"],
-        "inline2": ["balance_pairs", "text_collapse"],
+        "inline2": ["balance_pairs", "fragments_join"],
     }
     md.disable(["text"])
     assert md.get_active_rules() == {
         "block": ["paragraph"],
-        "core": ["normalize", "block"],
+        "core": ["normalize", "block", "text_join"],
         "inline": [],
-        "inline2": ["balance_pairs", "text_collapse"],
+        "inline2": ["balance_pairs", "fragments_join"],
     }
 
 
@@ -130,15 +131,15 @@ def test_reset():
         md.disable("inline")
         assert md.get_active_rules() == {
             "block": ["paragraph"],
-            "core": ["normalize", "block"],
+            "core": ["normalize", "block", "text_join"],
             "inline": ["text"],
-            "inline2": ["balance_pairs", "text_collapse"],
+            "inline2": ["balance_pairs", "fragments_join"],
         }
     assert md.get_active_rules() == {
         "block": ["paragraph"],
-        "core": ["normalize", "block", "inline"],
+        "core": ["normalize", "block", "inline", "text_join"],
         "inline": ["text"],
-        "inline2": ["balance_pairs", "text_collapse"],
+        "inline2": ["balance_pairs", "fragments_join"],
     }
 
 
diff --git a/tests/test_port/fixtures/linkify.md b/tests/test_port/fixtures/linkify.md
index 9edb78f3..c9755c03 100644
--- a/tests/test_port/fixtures/linkify.md
+++ b/tests/test_port/fixtures/linkify.md
@@ -96,4 +96,11 @@ after
 <p>before</p>
 <p><a href="http://github.com">github.com</a></p>
 <p>after</p>
-.
\ No newline at end of file
+.
+
+Don't match escaped
+.
+google\.com
+.
+<p>google.com</p>
+.
diff --git a/tests/test_port/fixtures/smartquotes.md b/tests/test_port/fixtures/smartquotes.md
index 70378b8e..e77175aa 100644
--- a/tests/test_port/fixtures/smartquotes.md
+++ b/tests/test_port/fixtures/smartquotes.md
@@ -163,4 +163,17 @@ Should parse quotes adjacent to inline html, #677:
 .
 <p>“test<br>”</p>
 <p>“<br>test”</p>
-.
\ No newline at end of file
+.
+
+Should be escapable:
+.
+"foo"
+
+\"foo"
+
+"foo\"
+.
+<p>“foo”</p>
+<p>&quot;foo&quot;</p>
+<p>&quot;foo&quot;</p>
+.
diff --git a/tests/test_port/fixtures/typographer.md b/tests/test_port/fixtures/typographer.md
index 39154ed0..59e48941 100644
--- a/tests/test_port/fixtures/typographer.md
+++ b/tests/test_port/fixtures/typographer.md
@@ -81,6 +81,13 @@ dupes-ellipsis
 
 <p>!.. ?.. ,… !!!.. ???.. ,…</p>
 .
+copyright should be escapable
+.
+\(c)
+.
+<p>(c)</p>
+.
+
 dashes
 .
 
@@ -101,6 +108,16 @@ markdownit--awesome
 
 <p>markdownit–awesome</p>
 .
+dashes should be escapable
+.
+foo \-- bar
+
+foo -\- bar
+.
+<p>foo -- bar</p>
+<p>foo -- bar</p>
+.
+
 regression tests for #624
 .
 1---2---3
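
Illustration of the behaviour this patch produces (a minimal sketch, not part of the
patch itself; it assumes a markdown-it-py version that already ships the new
`text_special` token type and the `text_join` core rule):

    from markdown_it import MarkdownIt

    md = MarkdownIt("commonmark")

    # The escaped "*" is emitted by the `escape` rule as a `text_special` token,
    # so emphasis never pairs it; the `text_join` core rule then folds it back
    # into the surrounding text once inline parsing is finished.
    tokens = md.parse(r"\*not emphasis*")
    inline_token = tokens[1]  # paragraph_open, inline, paragraph_close
    print([(t.type, t.content) for t in (inline_token.children or [])])
    # expected: [('text', '*not emphasis*')]

    print(md.render(r"\*not emphasis*"))
    # expected: <p>*not emphasis*</p>

A plugin that needs to see escaped characters individually can register a core rule
after "inline" and before "text_join", where the `text_special` tokens are still
present as separate children of each inline token.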