text: replace BoldSpan/ItalicSpan with text properties

This representation is simpler to process and more flexible, especially with overlapping spans of bold/italic.
ricklupton · Oct 5, 2023 · e0ea16b · e0ea16b
1 parent 8760868
commit e0ea16b
Show file tree

Hide file tree

Showing 3 changed files with 255 additions and 101 deletions.
diff --git a/src/rmscene/text.py b/src/rmscene/text.py
@@ -68,35 +68,31 @@ def expand_text_items(
 
 @dataclass
 class CrdtStr:
+    """String with CrdtIds for chars and optional properties.
+
+    The properties apply to the whole `CrdtStr`. Use a list of
+    `CrdtStr`s to represent a sequence of spans of text with different
+    properties.
+
+    """
+
     s: str = ""
     i: list[CrdtId] = field(default_factory=list)
+    properties: dict = field(default_factory=dict)
 
     def __str__(self):
         return self.s
 
 
-@dataclass
-class TextSpan:
-    """Base class for text spans with formatting."""
-
-    contents: list[tp.Union["TextSpan", CrdtStr]]
-
-
-class BoldSpan(TextSpan):
-    pass
-
-
-class ItalicSpan(TextSpan):
-    pass
-
-
 @dataclass
 class Paragraph:
     """Paragraph of text."""
 
-    contents: list[TextSpan]
+    contents: list[CrdtStr]
     start_id: CrdtId
-    style: LwwValue[si.ParagraphStyle]
+    style: LwwValue[si.ParagraphStyle] = field(
+        default_factory=lambda: LwwValue(CrdtId(0, 0), si.ParagraphStyle.PLAIN)
+    )
 
     def __str__(self):
         return "".join(str(s) for s in self.contents)
@@ -120,78 +116,53 @@ def from_scene_item(cls, text: si.Text):
         # Expand from strings to characters
         char_items = CrdtSequence(expand_text_items(text.items.sequence_items()))
         keys = list(char_items)
-        last_linebreak = si.END_MARKER
-
-        span_start_codes = {
-            1: BoldSpan,
-            3: ItalicSpan,
-        }
-        span_end_codes = {
-            2: BoldSpan,
-            4: ItalicSpan,
-        }
+        properties = {"font-weight": "normal", "font-style": "normal"}
+
+        def handle_formatting_code(code):
+            if code == 1:
+                properties["font-weight"] = "bold"
+            elif code == 2:
+                properties["font-weight"] = "normal"
+            if code == 3:
+                properties["font-style"] = "italic"
+            elif code == 4:
+                properties["font-style"] = "normal"
+            else:
+                _logger.warning("Unknown formatting code in text: %d", code)
+            return properties
 
         def parse_paragraph_contents():
-            nonlocal last_linebreak
-            stack = [(None, [])]
-            k = None
-            done = False
+            if keys and char_items[keys[0]] == "\n":
+                start_id = keys.pop(0)
+            else:
+                start_id = si.END_MARKER
+            contents = []
             while keys:
-                # If we've seen a newline character, only interested in
-                # span-closing format codes.
-                if done and char_items[keys[0]] not in (2, 4):
-                    break
-
-                k = keys.pop(0)
-                char = char_items[k]
+                char = char_items[keys[0]]
                 if isinstance(char, int):
-                    if char in span_start_codes:
-                        span_type = span_start_codes[char]
-                        stack.append((span_type, []))
-                    elif char in span_end_codes:
-                        span_type, nested = stack.pop()
-                        if span_type is not span_end_codes[char]:
-                            _logger.error(
-                                "Unexpected end of span at %s: got %s, expected %s",
-                                k,
-                                span_end_codes[char],
-                                span_type,
-                            )
-                        if span_type is not None:
-                            stack[-1][1].append(span_type(nested))
-                    else:
-                        _logger.warning("Unknown format code %d at %s!", char, k)
+                    handle_formatting_code(char)
                 elif char == "\n":
                     # End of paragraph
-                    done = True
-                    last_linebreak = k
+                    break
                 else:
                     assert len(char) <= 1
-                    _, contents = stack[-1]
-                    if not contents or not isinstance(contents[-1], CrdtStr):
-                        contents += [CrdtStr()]
+                    # Start a new string if text properties have changed
+                    if not contents or contents[-1].properties != properties:
+                        contents += [CrdtStr(properties=properties.copy())]
                     contents[-1].s += char
-                    contents[-1].i += [k]
+                    contents[-1].i += [keys[0]]
+                keys.pop(0)
 
-            if len(stack) > 1:
-                _logger.error("Unbalanced stack! %s", stack)
-
-            _, contents = stack[-1]
-            return contents
+            return start_id, contents
 
         paragraphs = []
         while keys:
-            style = text.styles.get(
-                last_linebreak, LwwValue(CrdtId(0, 0), si.ParagraphStyle.PLAIN)
-            )
-            contents = parse_paragraph_contents()
-            p = Paragraph(contents, last_linebreak, style)
+            start_id, contents = parse_paragraph_contents()
+            if start_id in text.styles:
+                p = Paragraph(contents, start_id, text.styles[start_id])
+            else:
+                p = Paragraph(contents, start_id)
             paragraphs += [p]
 
         doc = cls(paragraphs)
         return doc
-
-        # if k in char_formats:
-        #     current_format = char_formats[k]
-        #     if char != "\n":
-        #         _logger.warning("format does not apply to whole line")
diff --git a/tests/test_text.py b/tests/test_text.py
@@ -1,5 +1,13 @@
-from rmscene.text import expand_text_item
-from rmscene import CrdtId, CrdtSequenceItem
+import pytest
+from rmscene.text import (
+    expand_text_item,
+    expand_text_items,
+    TextDocument,
+    CrdtStr,
+    Paragraph,
+)
+from rmscene import scene_items as si
+from rmscene import CrdtId, CrdtSequenceItem, CrdtSequence
 
 
 def cid(k):
@@ -44,3 +52,174 @@ def test_expand_text_empty():
         make_item(21, 20, 22, 1, ""),
         make_item(22, 21, 0, 1, ""),
     ]
+
+
+START_BOLD = 1
+END_BOLD = 2
+START_ITALIC = 3
+END_ITALIC = 4
+
+
+def doc_from_items(items):
+    root_text = si.Text(
+        items=CrdtSequence(items),
+        styles={},
+        pos_x=-468.0,
+        pos_y=234.0,
+        width=936.0,
+    )
+    doc = TextDocument.from_scene_item(root_text)
+    return doc
+
+
+def test_inline_formatting_italic_over_paragraphs():
+    doc = doc_from_items(
+        [
+            make_item(20, 0, 0, 0, "A"),
+            make_item(21, 20, 0, 0, "B\nC"),
+            make_item(24, 23, 0, 0, "D"),
+            # Start italic between A and B
+            make_item(30, 20, 21, 0, START_ITALIC),
+            # End italic between C and D
+            make_item(31, 23, 24, 0, END_ITALIC),
+        ]
+    )
+
+    assert doc.contents == [
+        Paragraph(
+            contents=[
+                CrdtStr(
+                    "A",
+                    [CrdtId(1, 20)],
+                    {"font-weight": "normal", "font-style": "normal"},
+                ),
+                CrdtStr(
+                    "B",
+                    [CrdtId(1, 21)],
+                    {"font-weight": "normal", "font-style": "italic"},
+                ),
+            ],
+            start_id=CrdtId(0, 0),
+        ),
+        Paragraph(
+            contents=[
+                CrdtStr(
+                    "C",
+                    [CrdtId(1, 23)],
+                    {"font-weight": "normal", "font-style": "italic"},
+                ),
+                CrdtStr(
+                    "D",
+                    [CrdtId(1, 24)],
+                    {"font-weight": "normal", "font-style": "normal"},
+                ),
+            ],
+            start_id=CrdtId(1, 22),
+        ),
+    ]
+
+
+def test_inline_formatting_italic_over_paragraphs():
+    doc = doc_from_items(
+        [
+            make_item(20, 0, 0, 0, "A"),
+            make_item(21, 20, 0, 0, "B\nC"),
+            make_item(24, 23, 0, 0, "D"),
+            # Start italic between A and B
+            make_item(30, 20, 21, 0, START_ITALIC),
+            # End italic between C and D
+            make_item(31, 23, 24, 0, END_ITALIC),
+        ]
+    )
+
+    assert doc.contents == [
+        Paragraph(
+            contents=[
+                CrdtStr(
+                    "A",
+                    [CrdtId(1, 20)],
+                    {"font-weight": "normal", "font-style": "normal"},
+                ),
+                CrdtStr(
+                    "B",
+                    [CrdtId(1, 21)],
+                    {"font-weight": "normal", "font-style": "italic"},
+                ),
+            ],
+            start_id=CrdtId(0, 0),
+        ),
+        Paragraph(
+            contents=[
+                CrdtStr(
+                    "C",
+                    [CrdtId(1, 23)],
+                    {"font-weight": "normal", "font-style": "italic"},
+                ),
+                CrdtStr(
+                    "D",
+                    [CrdtId(1, 24)],
+                    {"font-weight": "normal", "font-style": "normal"},
+                ),
+            ],
+            start_id=CrdtId(1, 22),
+        ),
+    ]
+
+
+def test_inline_formatting_bold_italic_interleaved_over_paragraphs():
+    doc = doc_from_items(
+        [
+            make_item(20, 0, 0, 0, "ABC\nDEF"),
+            # Start italic between A and B
+            make_item(30, 20, 21, 0, START_ITALIC),
+            # Start bold between B and C
+            make_item(31, 21, 22, 0, START_BOLD),
+            # End italic between D and E
+            make_item(32, 24, 25, 0, END_ITALIC),
+            # End bold between E and F
+            make_item(33, 25, 26, 0, END_BOLD),
+        ]
+    )
+
+    assert doc.contents == [
+        Paragraph(
+            contents=[
+                CrdtStr(
+                    "A",
+                    [CrdtId(1, 20)],
+                    {"font-weight": "normal", "font-style": "normal"},
+                ),
+                CrdtStr(
+                    "B",
+                    [CrdtId(1, 21)],
+                    {"font-weight": "normal", "font-style": "italic"},
+                ),
+                CrdtStr(
+                    "C",
+                    [CrdtId(1, 22)],
+                    {"font-weight": "bold", "font-style": "italic"},
+                ),
+            ],
+            start_id=CrdtId(0, 0),
+        ),
+        Paragraph(
+            contents=[
+                CrdtStr(
+                    "D",
+                    [CrdtId(1, 24)],
+                    {"font-weight": "bold", "font-style": "italic"},
+                ),
+                CrdtStr(
+                    "E",
+                    [CrdtId(1, 25)],
+                    {"font-weight": "bold", "font-style": "normal"},
+                ),
+                CrdtStr(
+                    "F",
+                    [CrdtId(1, 26)],
+                    {"font-weight": "normal", "font-style": "normal"},
+                ),
+            ],
+            start_id=CrdtId(1, 23),
+        ),
+    ]